From 3ff78cc501e5255f0db02d932a63f3e69a8be085 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Apr 2024 16:45:31 +0300 Subject: [PATCH 01/21] HTML API: Add custom text decoder. Provide a custom decoder for strings coming from HTML attributes and markup. This custom decoder is necessary because of deficiencies in PHP's `html_entity_decode()` function: - It isn't aware of 720 of the possible named character references in HTML, leaving many out that should be translated. - It isn't able to decode character references in data segments where the final semicolon is missing, or when there are ambiguous characters after the reference name but before the semicolon. This one is complicated: refer to the HTML5 specification to clarify. This decoder will also provide some conveniences, such as making a single-pass and interruptible decode operation possible. This will provide a number of opportunities to optimize detection and decoding of things like value prefixes, and whether a value contains a given substring. 
--- .../html-api/class-wp-html-decoder.php | 435 ++++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 10 +- src/wp-settings.php | 1 + .../html-api/wpHtmlProcessorHtml5lib.php | 65 ++- 4 files changed, 465 insertions(+), 46 deletions(-) create mode 100644 src/wp-includes/html-api/class-wp-html-decoder.php diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php new file mode 100644 index 0000000000000..7d446b2c08f30 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -0,0 +1,435 @@ += $end ) { + break; + } + + $character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $skip_bytes ); + if ( isset( $character_reference ) ) { + $at = $next_character_reference_at; + $decoded .= substr( $text, $was_at, $at - $was_at ); + $decoded .= $character_reference; + $at += $skip_bytes; + $was_at = $at; + continue; + } + + ++$at; + } + + if ( 0 === $was_at ) { + return $text; + } + + if ( $was_at < $end ) { + $decoded .= substr( $text, $was_at, $end - $was_at ); + } + + return $decoded; + } + + /** + * Attempt to read a character reference at the given location in a given string, + * depending on the context in which it's found. + * + * If a character reference is found, this function will return the translated value + * that the reference maps to. It will then set in `$skip_bytes` how many bytes of + * input it read while consuming the character reference. This gives calling code the + * opportunity to advance its cursor when traversing a string and decoding. It + * indicates how long the character reference was. 
+ * + * Example: + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 0 ); + * '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 5, $skip_bytes ); + * 8 === $skip_bytes; + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0 ); + * '¬' === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0, $skip_bytes ); + * 4 === $skip_bytes; + * + * @since 6.6.0 + * + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param ?int $at Byte offset into text where span begins, defaults to the beginning. + * @param ?int $skip_bytes How many bytes the decodable portion of the text spans. + * The default value spans to the end of the text. + * @return string|null Decoded character reference if found, otherwise `false`. + */ + public static function read_character_reference( $context, $text, $at, &$skip_bytes = null ) { + global $html5_named_character_references; + + $length = strlen( $text ); + if ( $at + 1 >= $length ) { + return null; + } + + if ( '&' !== $text[ $at ] ) { + return null; + } + + /* + * Numeric character references. + * + * When truncated, these will encode the code point found by parsing the + * digits that are available. For example, when `🅰` is truncated + * to `DZ` it will encode `DZ`. It does not: + * - know how to parse the original `🅰`. + * - fail to parse and return plaintext `DZ`. + * - fail to parse and return the replacement character `�` + */ + if ( '#' === $text[ $at + 1 ] ) { + if ( $at + 2 >= $length ) { + return null; + } + + /** Tracks inner parsing within the numeric character reference. 
*/ + $digits_at = $at + 2; + + if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) { + $numeric_base = 16; + $numeric_digits = '0123456789abcdefABCDEF'; + $max_digits = 6; // 􏿿 + ++$digits_at; + } else { + $numeric_base = 10; + $numeric_digits = '0123456789'; + $max_digits = 7; // 􏿿 + } + + // Cannot encode invalid Unicode code points. Max is to U+10FFFF. + $zero_count = strspn( $text, '0', $digits_at ); + $digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count ); + $after_digits = $digits_at + $zero_count + $digit_count; + $has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ]; + $end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits; + + // `&#` or `&#x` without digits returns into plaintext. + if ( 0 === $digit_count && 0 === $zero_count ) { + return null; + } + + if ( 0 === $digit_count ) { + $skip_bytes = $end_of_span - $at; + return '�'; + } + + if ( $digit_count - $zero_count > $max_digits ) { + $skip_bytes = $end_of_span - $at; + return '�'; + } + + $digits = substr( $text, $digits_at + $zero_count, $digit_count ); + $code_point = intval( $digits, $numeric_base ); + + /* + * Noncharacters, 0x0D, and non-ASCII-whitespace control characters. + * + * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, + * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, + * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, + * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, + * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, + * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF. + * + * A C0 control is a code point that is in the range of U+00 to U+1F, + * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D. + * + * These characters are invalid but still decode as any valid character. + * This comment is here to note and explain why there's no check to + * remove these characters or replace them. 
+ * + * @see https://infra.spec.whatwg.org/#noncharacter + */ + + /* + * Code points in the C1 controls area need to be remapped as if they + * were stored in Windows-1252. Note! This transformation only happens + * for numeric character references. The raw code points in the byte + * stream are not translated. + * + * > If the number is one of the numbers in the first column of + * > the following table, then find the row with that number in + * > the first column, and set the character reference code to + * > the number in the second column of that row. + */ + if ( $code_point >= 0x80 && $code_point <= 0x9F ) { + $windows_1252_mapping = array( + 0x20AC, // 0x80 -> EURO SIGN (€). + 0x81, // 0x81 -> (no change). + 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚). + 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ). + 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („). + 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…). + 0x2020, // 0x86 -> DAGGER (†). + 0x2021, // 0x87 -> DOUBLE DAGGER (‡). + 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ). + 0x2030, // 0x89 -> PER MILLE SIGN (‰). + 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š). + 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹). + 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ). + 0x8D, // 0x8D -> (no change). + 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž). + 0x8F, // 0x8F -> (no change). + 0x90, // 0x90 -> (no change). + 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘). + 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’). + 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“). + 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”). + 0x2022, // 0x95 -> BULLET (•). + 0x2013, // 0x96 -> EN DASH (–). + 0x2014, // 0x97 -> EM DASH (—). + 0x02DC, // 0x98 -> SMALL TILDE (˜). + 0x2122, // 0x99 -> TRADE MARK SIGN (™). + 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š). + 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›). 
+ 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ). + 0x9D, // 0x9D -> (no change). + 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž). + 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ). + ); + + $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; + } + + $skip_bytes = $end_of_span - $at; + return self::code_point_to_utf8_bytes( $code_point ); + } + + /** Tracks inner parsing within the named character reference. */ + $name_at = $at + 1; + // Minimum named character reference is two characters. E.g. `GT`. + if ( $name_at + 2 > $length ) { + return null; + } + + $name_length = 0; + $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); + if ( false === $replacement ) { + return null; + } + + $after_name = $name_at + $name_length; + + // If the match ended with a semicolon then it should always be decoded. + if ( ';' === $text[ $name_at + $name_length - 1 ] ) { + $skip_bytes = $after_name - $at; + return $replacement; + } + + /* + * At this point though there's a match for an entry in the named + * character reference table but the match doesn't end in `;`. + * It may be allowed if it's followed by something unambiguous. + */ + $ambiguous_follower = ( + $after_name < $length && + $name_at < $length && + ( + ctype_alnum( $text[ $after_name ] ) || + '=' === $text[ $after_name ] + ) + ); + + // It's non-ambiguous, safe to leave it in. + if ( ! $ambiguous_follower ) { + $skip_bytes = $after_name - $at; + return $replacement; + } + + if ( 'attribute' === $context ) { + return null; + } + + $skip_bytes = $after_name - $at; + return $replacement; + } + + /** + * Encode a code point number into the UTF-8 encoding. + * + * This encoder implements the encoding algorithm for converting a number + * into a byte sequence, but if it receives an invalid code point it will + * return the Unicode Replacement Character U+FFFD `�`. 
+ * + * Example: + * + * '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 ); + * + * // Half of a surrogate pair is an invalid code point. + * '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c ); + * + * @since 6.6.0 + * + * @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8 + * + * @param int $code_point Which code point to convert. + * @return string Converted code point, or `�` if invalid. + */ + public static function code_point_to_utf8_bytes( $code_point ) { + if ( + $code_point <= 0 || + ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || + $code_point > 0x10FFFF + ) { + return '�'; + } + + if ( $code_point <= 0x7F ) { + return chr( $code_point ); + } + + if ( $code_point <= 0x7FF ) { + $byte1 = ( $code_point >> 6 ) | 0xC0; + $byte2 = $code_point & 0x3F | 0x80; + + return pack( 'CC', $byte1, $byte2 ); + } + + if ( $code_point <= 0xFFFF ) { + $byte1 = ( $code_point >> 12 ) | 0xE0; + $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte3 = $code_point & 0x3F | 0x80; + + return pack( 'CCC', $byte1, $byte2, $byte3 ); + } + + if ( $code_point <= 0x10FFFF ) { + $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; + + return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); + } + } +} diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 4597a888b5efe..c4a5ffe6de7a8 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -15,10 +15,6 @@ * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c". * This would increase the size of the changes for some operations but leave more * natural-looking output HTML. - * - Properly decode HTML character references in `get_attribute()`. 
PHP's - * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the - * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may - * or may not terminate a character reference. * * @package WordPress * @subpackage HTML-API @@ -2499,7 +2495,7 @@ private function get_enqueued_attribute_value( $comparable_name ) { * 3. Double-quoting ends at the last character in the update. */ $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); - return html_entity_decode( $enqueued_value ); + return WP_HTML_Decoder::decode_attribute( $enqueued_value ); } /** @@ -2572,7 +2568,7 @@ public function get_attribute( $name ) { $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); - return html_entity_decode( $raw_value ); + return WP_HTML_Decoder::decode_attribute( $raw_value ); } /** @@ -2872,7 +2868,7 @@ public function get_modifiable_text() { return $text; } - $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); + $decoded = WP_HTML_Decoder::decode_text_node( $text ); /* * TEXTAREA skips a leading newline, but this newline may appear not only as the diff --git a/src/wp-settings.php b/src/wp-settings.php index 4d8a35ae8358f..c644e60605dbc 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -252,6 +252,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-unsupported-exception.php'; require ABSPATH . WPINC . 
'/html-api/class-wp-html-active-formatting-elements.php'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index c40481ac18e45..523966d412d25 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -31,41 +31,32 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { * Skip specific tests that may not be supported or have known issues. */ const SKIP_TESTS = array( - 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', - 'entities02/line0100' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0114' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0128' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0142' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0156' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'inbody01/line0001' => 'Bug.', - 'inbody01/line0014' => 'Bug.', - 'inbody01/line0029' => 'Bug.', - 'menuitem-element/line0012' => 'Bug.', - 'plain-text-unsafe/line0001' => 'HTML entities may be mishandled.', - 'plain-text-unsafe/line0105' => 'Binary.', - 'tests1/line0342' => "Closing P tag implicitly creates opener, which we don't visit.", - 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests1/line0833' => 'Bug.', - 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests15/line0022' => 
'Unimplemented: Reconstruction of active formatting elements.', - 'tests2/line0317' => 'HTML entities may be mishandled.', - 'tests2/line0408' => 'HTML entities may be mishandled.', - 'tests2/line0650' => 'Whitespace only test never enters "in body" parsing mode.', - 'tests20/line0497' => "Closing P tag implicitly creates opener, which we don't visit.", - 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests25/line0169' => 'Bug.', - 'tests26/line0263' => 'Bug: An active formatting element should be created for a trailing text node.', - 'tests7/line0354' => 'Bug.', - 'tests8/line0001' => 'Bug.', - 'tests8/line0020' => 'Bug.', - 'tests8/line0037' => 'Bug.', - 'tests8/line0052' => 'Bug.', - 'webkit01/line0174' => 'Bug.', + 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', + 'inbody01/line0001' => 'Bug.', + 'inbody01/line0014' => 'Bug.', + 'inbody01/line0029' => 'Bug.', + 'menuitem-element/line0012' => 'Bug.', + 'tests1/line0342' => "Closing P tag implicitly creates opener, which we don't visit.", + 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests1/line0833' => 'Bug.', + 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests15/line0022' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests2/line0650' => 'Whitespace only test never enters "in body" parsing mode.', + 'tests20/line0497' => "Closing P tag implicitly creates opener, which we don't 
visit.", + 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests25/line0169' => 'Bug.', + 'tests26/line0263' => 'Bug: An active formatting element should be created for a trailing text node.', + 'tests7/line0354' => 'Bug.', + 'tests8/line0001' => 'Bug.', + 'tests8/line0020' => 'Bug.', + 'tests8/line0037' => 'Bug.', + 'tests8/line0052' => 'Bug.', + 'webkit01/line0174' => 'Bug.', ); @@ -107,10 +98,6 @@ public function data_external_html5lib_tests() { continue; } - if ( 'entities01.dat' === $entry || 'entities02.dat' === $entry ) { - continue; - } - foreach ( self::parse_html5_dat_testfile( $test_dir . $entry ) as $k => $test ) { // strip .dat extension from filename $test_suite = substr( $entry, 0, -4 ); From 401d30facdb4d3cf0f7cfda18514b5e07eb2e5cb Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 14 May 2024 20:21:23 -0700 Subject: [PATCH 02/21] Correct typo in max digit computation. 
--- src/wp-includes/html-api/class-wp-html-decoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 7d446b2c08f30..1f8fe526b4f7f 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -243,7 +243,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by return '�'; } - if ( $digit_count - $zero_count > $max_digits ) { + if ( $digit_count > $max_digits ) { $skip_bytes = $end_of_span - $at; return '�'; } From 4f5f21eab60b5b8e24fbd7e4027d8d6dec671664 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 24 May 2024 15:15:52 -0700 Subject: [PATCH 03/21] Adjust docs, names. --- .../html-api/class-wp-html-decoder.php | 109 +++++++++++------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 1f8fe526b4f7f..96e6a77119c42 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -20,15 +20,14 @@ class WP_HTML_Decoder { * Example: * * $value = 'http://wordpress.org/'; - * true === WP_HTML_Decoder::attribute_starts_with( $value, 'http:', 'case-insensitive' ); - * false === WP_HTML_Decoder::attribute_starts_with( $value, 'https:', 'case-insensitive' ); + * true === WP_HTML_Decoder::attribute_starts_with( $value, 'http:', 'ascii-case-insensitive' ); + * false === WP_HTML_Decoder::attribute_starts_with( $value, 'https:', 'ascii-case-insensitive' ); * * @since 6.6.0 * * @param string $haystack String containing the raw non-decoded attribute value. * @param string $search_text Does the attribute value start with this plain string. - * @param ?string $case_sensitivity Set to `case-insensitive` to ignore ASCII case when matching. 
- * + * @param ?string $case_sensitivity Set to `ascii-case-insensitive` to ignore ASCII case when matching. * @return bool Whether the attribute value starts with the given string. */ public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) { @@ -44,7 +43,7 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen $is_introducer = '&' === $haystack[ $value_at ]; $next_chunk = $is_introducer - ? self::read_character_reference( $haystack, $value_at, false, $skip_bytes ) + ? self::read_character_reference( $haystack, $value_at, false, $token_length ) : false; // If there's no character reference and the characters don't match, the match fails. @@ -65,7 +64,7 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen } // The character reference matched, so continue checking. - $value_at += $skip_bytes; + $value_at += $token_length; $search_at += strlen( $next_chunk ); } @@ -75,14 +74,19 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen /** * Returns a string containing the decoded value of a given HTML text node. * + * Text nodes appear in HTML DATA sections, which are the text segments inside + * and around tags, excepting SCRIPT and STYLE elements (and some others), + * whose inner text is not decoded. Use this function to read the decoded + * value of such a text span in an HTML document. + * * Example: * * '“😄”' === WP_HTML_Decode::decode_text_node( '“😄”' ); * * @since 6.6.0 * - * @param string $text Text containing raw and non-decoded text node to decode. - * @return string Decoded value of given text node. + * @param string $text Text containing raw and non-decoded text node to decode. + * @return string Decoded UTF-8 value of given text node. 
*/ public static function decode_text_node( $text ) { return static::decode( 'data', $text ); @@ -91,14 +95,18 @@ public static function decode_text_node( $text ) { /** * Returns a string containing the decoded value of a given HTML attribute. * + * Text found inside an HTML attribute has different parsing rules than for + * text found inside other markup, or DATA segments. Use this function to + * read the decoded value of an HTML string inside a quoted attribute. + * * Example: * * '“😄”' === WP_HTML_Decode::decode_attribute( '“😄”' ); * * @since 6.6.0 * - * @param string $text Text containing raw and non-decoded attribute value to decode. - * @return string Decoded value of given attribute value. + * @param string $text Text containing raw and non-decoded attribute value to decode. + * @return string Decoded UTF-8 value of given attribute value. */ public static function decode_attribute( $text ) { return static::decode( 'attribute', $text ); @@ -108,7 +116,8 @@ public static function decode_attribute( $text ) { * Decodes a span of HTML text, depending on the context in which it's found. * * This is a low-level method; prefer calling WP_HTML_Decoder::decode_attribute() or - * WP_HTML_Decoder::decode_text_node() instead. + * WP_HTML_Decoder::decode_text_node() instead. It's provided for cases where this + * may be difficult to do from calling code. * * Example: * @@ -116,9 +125,11 @@ public static function decode_attribute( $text ) { * * @since 6.6.0 * + * @access private + * * @param string $context `attribute` for decoding attribute values, `data` otherwise. * @param string $text Text document containing span of text to decode. - * @return string Decoded string. + * @return string Decoded UTF-8 string. 
*/ public static function decode( $context, $text ) { $decoded = ''; @@ -132,12 +143,12 @@ public static function decode( $context, $text ) { break; } - $character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $skip_bytes ); + $character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $token_length ); if ( isset( $character_reference ) ) { $at = $next_character_reference_at; $decoded .= substr( $text, $was_at, $at - $was_at ); $decoded .= $character_reference; - $at += $skip_bytes; + $at += $token_length; $was_at = $at; continue; } @@ -161,33 +172,42 @@ public static function decode( $context, $text ) { * depending on the context in which it's found. * * If a character reference is found, this function will return the translated value - * that the reference maps to. It will then set in `$skip_bytes` how many bytes of - * input it read while consuming the character reference. This gives calling code the - * opportunity to advance its cursor when traversing a string and decoding. It - * indicates how long the character reference was. + * that the reference maps to. It will then set `$byte_length_of_matched_token` the + * number of bytes of input it read while consuming the character reference. This + * gives calling code the opportunity to advance its cursor when traversing a string + * and decoding. 
* * Example: * * null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 0 ); - * '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 5, $skip_bytes ); - * 8 === $skip_bytes; + * '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships…', 5, $token_length ); + * 8 === $token_length; // `…` * - * null === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0 ); - * '¬' === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0, $skip_bytes ); - * 4 === $skip_bytes; + * null === WP_HTML_Decoder::read_character_reference( 'attribute', '¬in', 0 ); + * '∉' === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0, $token_length ); + * 7 === $token_length; // `∉` + * + * '¬' === WP_HTML_Decoder::read_character_reference( 'data', '¬in', 0, $token_length ); + * 4 === $token_length; // `¬` + * '∉' === WP_HTML_Decoder::read_character_reference( 'data', '∉', 0, $token_length ); + * 7 === $token_length; // `∉` * * @since 6.6.0 * - * @param string $context `attribute` for decoding attribute values, `data` otherwise. - * @param string $text Text document containing span of text to decode. - * @param ?int $at Byte offset into text where span begins, defaults to the beginning. - * @param ?int $skip_bytes How many bytes the decodable portion of the text spans. - * The default value spans to the end of the text. - * @return string|null Decoded character reference if found, otherwise `false`. + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param ?int $at Byte offset into text where span begins, defaults to the beginning. + * @param ?int $byte_length_of_matched_token Set to byte length of matched character reference, if matched, + * otherwise not set. This is an "out" parameter. + * @return string|null Decoded character reference in UTF-8 if found, otherwise `false`. 
*/ - public static function read_character_reference( $context, $text, $at, &$skip_bytes = null ) { + public static function read_character_reference( $context, $text, $at, &$byte_length_of_matched_token = null ) { global $html5_named_character_references; + if ( ! isset( $at ) ) { + $at = 0; + } + $length = strlen( $text ); if ( $at + 1 >= $length ) { return null; @@ -238,13 +258,15 @@ public static function read_character_reference( $context, $text, $at, &$skip_by return null; } + // Whereas `&#` and only zeros is invalid. if ( 0 === $digit_count ) { - $skip_bytes = $end_of_span - $at; + $byte_length_of_matched_token = $end_of_span - $at; return '�'; } + // If there are too many digits then it's not worth parsing. It's invalid. if ( $digit_count > $max_digits ) { - $skip_bytes = $end_of_span - $at; + $byte_length_of_matched_token = $end_of_span - $at; return '�'; } @@ -321,7 +343,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; } - $skip_bytes = $end_of_span - $at; + $byte_length_of_matched_token = $end_of_span - $at; return self::code_point_to_utf8_bytes( $code_point ); } @@ -342,7 +364,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by // If the match ended with a semicolon then it should always be decoded. if ( ';' === $text[ $name_at + $name_length - 1 ] ) { - $skip_bytes = $after_name - $at; + $byte_length_of_matched_token = $after_name - $at; return $replacement; } @@ -362,15 +384,16 @@ public static function read_character_reference( $context, $text, $at, &$skip_by // It's non-ambiguous, safe to leave it in. if ( ! $ambiguous_follower ) { - $skip_bytes = $after_name - $at; + $byte_length_of_matched_token = $after_name - $at; return $replacement; } + // It's ambiguous, which isn't allowed inside attributes. 
if ( 'attribute' === $context ) { return null; } - $skip_bytes = $after_name - $at; + $byte_length_of_matched_token = $after_name - $at; return $replacement; } @@ -396,6 +419,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by * @return string Converted code point, or `�` if invalid. */ public static function code_point_to_utf8_bytes( $code_point ) { + // Pre-check to ensure a valid code point. if ( $code_point <= 0 || ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || @@ -423,13 +447,12 @@ public static function code_point_to_utf8_bytes( $code_point ) { return pack( 'CCC', $byte1, $byte2, $byte3 ); } - if ( $code_point <= 0x10FFFF ) { - $byte1 = ( $code_point >> 18 ) | 0xF0; - $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; - $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; - $byte4 = $code_point & 0x3F | 0x80; + // Any values above U+10FFFF are eliminated above in the pre-check. + $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; - return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); - } + return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); } } From 72aeccd7a4c27b01e20fb0994da691b9157b36d2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 24 May 2024 18:04:16 -0700 Subject: [PATCH 04/21] Updates to attribute_starts_with --- .../html-api/class-wp-html-decoder.php | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 96e6a77119c42..a92e2021d4c74 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -32,18 +32,19 @@ class WP_HTML_Decoder { */ public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) { $search_length = strlen( $search_text ); - 
$loose_case = 'case-insensitive' === $case_sensitivity; + $loose_case = 'ascii-case-insensitive' === $case_sensitivity; $haystack_end = strlen( $haystack ); $search_at = 0; + $haystack_at = 0; - while ( $search_at < $search_length && $value_at < $haystack_end ) { + while ( $search_at < $search_length && $haystack_at < $haystack_end ) { $chars_match = $loose_case - ? strtolower( $haystack[ $value_at ] ) === strtolower( $search_text[ $search_at ] ) - : $haystack[ $value_at ] === $search_text[ $search_at ]; + ? strtolower( $haystack[ $haystack_at ] ) === strtolower( $search_text[ $search_at ] ) + : $haystack[ $haystack_at ] === $search_text[ $search_at ]; - $is_introducer = '&' === $haystack[ $value_at ]; + $is_introducer = '&' === $haystack[ $haystack_at ]; $next_chunk = $is_introducer - ? self::read_character_reference( $haystack, $value_at, false, $token_length ) + ? self::read_character_reference( 'attribute', $haystack, $haystack_at, $token_length ) : false; // If there's no character reference and the characters don't match, the match fails. @@ -53,7 +54,7 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen // If there's no character reference but the character do match, then it could still match. if ( false === $next_chunk && $chars_match ) { - ++$value_at; + ++$haystack_at; ++$search_at; continue; } @@ -64,7 +65,7 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen } // The character reference matched, so continue checking. 
- $value_at += $token_length; + $haystack_at += $token_length; $search_at += strlen( $next_chunk ); } From d60e320c70c8e761bb5e0ee2408be414e2cef4e0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 27 May 2024 18:19:19 -0700 Subject: [PATCH 05/21] Add tests for attribute_starts_with --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 tests/phpunit/tests/html-api/wpHtmlDecoder.php diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php new file mode 100644 index 0000000000000..5e5fe03a6186d --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -0,0 +1,114 @@ +assertTrue( + WP_HTML_Decoder::attribute_starts_with( $attribute_value, $search_string, 'ascii-case-insensitive' ), + "Should have found that '{$attribute_value}' starts with '{$search_string}'" + ); + } + + /** + * Data provider. + * + * @return Generator. + */ + public static function data_case_variants_of_attribute_prefixes() { + $with_javascript_prefix = array( + 'javascript:', + 'JAVASCRIPT:', + 'javascript:', + 'javascript:', + 'javascript:', + 'javascript:', + 'javascript&colon', + 'javascript:alert(1)', + 'JaVaScRiPt:alert(1)', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert(1);', + 'javascript:alert('XSS')', + 'javascript:javascript:alert(1);', + 'javascript:javascript:alert(1);', + 'javascript:javascript:alert(1);', + 'javascript:javascript:alert(1);', + 'javascript:javascript:alert(1);', + 'javascript:alert(1)//?:', + 'javascript:alert(1)', + 'javascript:x=1;alert(1)', + ); + + foreach ( $with_javascript_prefix as $attribute_value ) { + yield $attribute_value => array( $attribute_value, 'javascript:' ); + } + } + + /** + * Ensures that `attribute_starts_with` respects the case 
sensitivity argument. + * + * @ticket 61072 + * + * @dataProvider data_attributes_with_prefix_and_case_sensitive_match + * + * @param string $attribute_value Raw attribute value from HTML string. + * @param string $search_string Prefix contained or not contained in encoded attribute value. + * @param string $case_sensitivity Whether to search with ASCII case sensitivity; + * 'ascii-case-insensitive' or 'case-sensitive'. + * @param bool $is_match Whether the search string is a prefix for the attribute value, + * given the case sensitivity setting. + */ + public function test_attribute_starts_with_heeds_case_sensitivity( $attribute_value, $search_string, $case_sensitivity, $is_match ) { + if ( $is_match ) { + $this->assertTrue( + WP_HTML_Decoder::attribute_starts_with( $attribute_value, $search_string, $case_sensitivity ), + 'Should have found attribute prefix with case-sensitive search.' + ); + } else { + $this->assertFalse( + WP_HTML_Decoder::attribute_starts_with( $attribute_value, $search_string, $case_sensitivity ), + 'Should not have matched attribute with prefix with ASCII-case-insensitive search.' + ); + } + } + + /** + * Data provider. + * + * @return array[]. 
+ */ + public static function data_attributes_with_prefix_and_case_sensitive_match() { + return array( + array( 'http://wordpress.org', 'http', 'case-sensitive', true ), + array( 'http://wordpress.org', 'http', 'ascii-case-insensitive', true ), + array( 'http://wordpress.org', 'HTTP', 'case-sensitive', false ), + array( 'http://wordpress.org', 'HTTP', 'ascii-case-insensitive', true ), + ); + } +} From db384b92ca489ed267c121e635e6decf62b53951 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 28 May 2024 18:00:18 +0200 Subject: [PATCH 06/21] Fix alignment lint --- src/wp-includes/html-api/class-wp-html-decoder.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index a92e2021d4c74..4e81a4ff883a9 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -65,8 +65,8 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen } // The character reference matched, so continue checking. - $haystack_at += $token_length; - $search_at += strlen( $next_chunk ); + $haystack_at += $token_length; + $search_at += strlen( $next_chunk ); } return true; From be8b2a55ac82eae3c882e5a72eaec7236a561d03 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 28 May 2024 18:08:34 +0200 Subject: [PATCH 07/21] Annotate global variable type --- src/wp-includes/html-api/class-wp-html-decoder.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 4e81a4ff883a9..03fb0b0183cee 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -203,6 +203,8 @@ public static function decode( $context, $text ) { * @return string|null Decoded character reference in UTF-8 if found, otherwise `false`. 
*/ public static function read_character_reference( $context, $text, $at, &$byte_length_of_matched_token = null ) { + + /** @var WP_Token_Map $html5_named_character_references */ global $html5_named_character_references; if ( ! isset( $at ) ) { From 8f9b0761c5948364a366da062e7708ec5e99ae36 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 28 May 2024 18:09:17 +0200 Subject: [PATCH 08/21] Change read_character_reference return type to string|false --- .../html-api/class-wp-html-decoder.php | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 03fb0b0183cee..57bf09d2a2d52 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -200,7 +200,7 @@ public static function decode( $context, $text ) { * @param ?int $at Byte offset into text where span begins, defaults to the beginning. * @param ?int $byte_length_of_matched_token Set to byte length of matched character reference, if matched, * otherwise not set. This is an "out" parameter. - * @return string|null Decoded character reference in UTF-8 if found, otherwise `false`. + * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. */ public static function read_character_reference( $context, $text, $at, &$byte_length_of_matched_token = null ) { @@ -213,11 +213,11 @@ public static function read_character_reference( $context, $text, $at, &$byte_le $length = strlen( $text ); if ( $at + 1 >= $length ) { - return null; + return false; } if ( '&' !== $text[ $at ] ) { - return null; + return false; } /* @@ -232,7 +232,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le */ if ( '#' === $text[ $at + 1 ] ) { if ( $at + 2 >= $length ) { - return null; + return false; } /** Tracks inner parsing within the numeric character reference. 
*/ @@ -258,7 +258,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le // `&#` or `&#x` without digits returns into plaintext. if ( 0 === $digit_count && 0 === $zero_count ) { - return null; + return false; } // Whereas `&#` and only zeros is invalid. @@ -354,13 +354,13 @@ public static function read_character_reference( $context, $text, $at, &$byte_le $name_at = $at + 1; // Minimum named character reference is two characters. E.g. `GT`. if ( $name_at + 2 > $length ) { - return null; + return false; } $name_length = 0; $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); if ( false === $replacement ) { - return null; + return false; } $after_name = $name_at + $name_length; @@ -393,7 +393,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le // It's ambiguous, which isn't allowed inside attributes. if ( 'attribute' === $context ) { - return null; + return false; } $byte_length_of_matched_token = $after_name - $at; From b55932072cf6292f86329a289bea8280c8618c6b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 28 May 2024 18:48:08 +0200 Subject: [PATCH 09/21] Replace &colon entity test with ∷ --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 5e5fe03a6186d..a2f59084987ae 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -42,7 +42,7 @@ public static function data_case_variants_of_attribute_prefixes() { 'javascript:', 'javascript:', 'javascript:', - 'javascript&colon', + 'javascript∷', 'javascript:alert(1)', 'JaVaScRiPt:alert(1)', 'javascript:alert(1);', From 53dafce985ba3666fb70c6cfb85cff7b49a04f4d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Tue, 28 May 2024 18:49:36 +0200 Subject: [PATCH 10/21] 
=?UTF-8?q?Remove=20∷=20test=20(this=20means?= =?UTF-8?q?=20"=E2=88=B7"!)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index a2f59084987ae..340f248d4a3ec 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -42,7 +42,6 @@ public static function data_case_variants_of_attribute_prefixes() { 'javascript:', 'javascript:', 'javascript:', - 'javascript∷', 'javascript:alert(1)', 'JaVaScRiPt:alert(1)', 'javascript:alert(1);', From 3bf043476a2d0523ff052ec40f85bb02217a5f93 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 28 May 2024 21:50:26 +0200 Subject: [PATCH 11/21] Add WIP test for edge cases. --- .../phpunit/tests/html-api/wpHtmlDecoder.php | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 340f248d4a3ec..5699df7df9c61 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -12,6 +12,25 @@ * @coversDefaultClass WP_HTML_Decoder */ class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase { + /** + * @dataProvider data_edge_cases + * @param $raw_text_node + * @param $decoded_value + */ + public function test_edge_cases( $raw_text_node, $decoded_value ) { + $this->assertSame( + $decoded_value, + WP_HTML_Decoder::decode_text_node( $raw_text_node ), + 'Improperly decoded raw text node.' + ); + } + + public static function data_edge_cases() { + return array( + 'Single ampersand' => array( '&', '&' ), + ); + } + /** * Ensures proper detection of attribute prefixes ignoring ASCII case. 
* From edbc0b0a30d16a3001b97de95cbaec13d1c5fb88 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 28 May 2024 11:22:07 +0200 Subject: [PATCH 12/21] Fix type error in text decoder --- src/wp-includes/html-api/class-wp-html-decoder.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 57bf09d2a2d52..09cacac26973b 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -45,15 +45,15 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen $is_introducer = '&' === $haystack[ $haystack_at ]; $next_chunk = $is_introducer ? self::read_character_reference( 'attribute', $haystack, $haystack_at, $token_length ) - : false; + : null; // If there's no character reference and the characters don't match, the match fails. - if ( false === $next_chunk && ! $chars_match ) { + if ( null === $next_chunk && ! $chars_match ) { return false; } // If there's no character reference but the character do match, then it could still match. - if ( false === $next_chunk && $chars_match ) { + if ( null === $next_chunk && $chars_match ) { ++$haystack_at; ++$search_at; continue; From fd83c4646c726b51d2cf39dc6ccfc506c4ea786e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 29 May 2024 12:10:56 +0200 Subject: [PATCH 13/21] Replace false with null for token reading. 
--- src/wp-includes/class-wp-token-map.php | 10 +++++----- .../html-api/class-wp-html-decoder.php | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 564c0e07de095..558ca4eee3145 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -524,7 +524,7 @@ public function contains( $word, $case_sensitivity = 'case-sensitive' ) { * @param ?int $offset How many bytes into the string where the lookup key ought to start. * @param ?int &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set. * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'. - * @return string|false Mapped value of lookup key if found, otherwise `false`. + * @return string|null Mapped value of lookup key if found, otherwise `null`. */ public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) { $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; @@ -539,7 +539,7 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu // Perhaps a short word then. return strlen( $this->small_words ) > 0 ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) - : false; + : null; } $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; @@ -564,7 +564,7 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu // Perhaps a short word then. return strlen( $this->small_words ) > 0 ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) - : false; + : null; } /** @@ -576,7 +576,7 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu * @param ?int $offset How many bytes into the string where the lookup key ought to start. 
* @param ?int &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set. * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'. - * @return string|false Mapped value of lookup key if found, otherwise `false`. + * @return string|null Mapped value of lookup key if found, otherwise `null`. */ private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) { $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; @@ -616,7 +616,7 @@ private function read_small_token( $text, $offset, &$matched_token_byte_length, return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; } - return false; + return null; } /** diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 09cacac26973b..b03756a6fbcc3 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -213,11 +213,11 @@ public static function read_character_reference( $context, $text, $at, &$byte_le $length = strlen( $text ); if ( $at + 1 >= $length ) { - return false; + return null; } if ( '&' !== $text[ $at ] ) { - return false; + return null; } /* @@ -232,7 +232,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le */ if ( '#' === $text[ $at + 1 ] ) { if ( $at + 2 >= $length ) { - return false; + return null; } /** Tracks inner parsing within the numeric character reference. */ @@ -258,7 +258,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le // `&#` or `&#x` without digits returns into plaintext. if ( 0 === $digit_count && 0 === $zero_count ) { - return false; + return null; } // Whereas `&#` and only zeros is invalid. 
@@ -354,13 +354,13 @@ public static function read_character_reference( $context, $text, $at, &$byte_le $name_at = $at + 1; // Minimum named character reference is two characters. E.g. `GT`. if ( $name_at + 2 > $length ) { - return false; + return null; } $name_length = 0; $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); - if ( false === $replacement ) { - return false; + if ( ! isset( $replacement ) ) { + return null; } $after_name = $name_at + $name_length; @@ -393,7 +393,7 @@ public static function read_character_reference( $context, $text, $at, &$byte_le // It's ambiguous, which isn't allowed inside attributes. if ( 'attribute' === $context ) { - return false; + return null; } $byte_length_of_matched_token = $after_name - $at; From 59671d6d29a705252b656905544cac188338cdd2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 29 May 2024 12:46:43 +0200 Subject: [PATCH 14/21] Update tests for Token Map returning null. --- tests/phpunit/tests/wp-token-map/wpTokenMap.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/wp-token-map/wpTokenMap.php b/tests/phpunit/tests/wp-token-map/wpTokenMap.php index fb2a08655d2a6..23edc5de80b94 100644 --- a/tests/phpunit/tests/wp-token-map/wpTokenMap.php +++ b/tests/phpunit/tests/wp-token-map/wpTokenMap.php @@ -317,7 +317,7 @@ public function test_reads_token_at_given_offset( $token, $replacement ) { $map = self::get_html5_token_map(); $skip_bytes = 0; - $this->assertFalse( + $this->assertNull( $map->read_token( $document, 0, $skip_bytes ), "Shouldn't have found token at start of document." 
); From ab672e73dca9c4dc3b7484a5bae159f5eca832c7 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 13:29:14 +0200 Subject: [PATCH 15/21] Fill in test phpdoc details --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 5699df7df9c61..1355d8ecef489 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -13,9 +13,14 @@ */ class Tests_HtmlApi_WpHtmlDecoder extends WP_UnitTestCase { /** + * Ensures proper decoding of edge cases. + * + * @ticket 61072 + * * @dataProvider data_edge_cases - * @param $raw_text_node - * @param $decoded_value + * + * @param $raw_text_node Raw input text. + * @param $decoded_value The expected decoded text result. */ public function test_edge_cases( $raw_text_node, $decoded_value ) { $this->assertSame( From 2e2fb8b2ecfc32f226fcfe9aa04c13dfc94ea807 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 14:08:25 +0200 Subject: [PATCH 16/21] Update optional $case_sensitivity param types ?string (nullable string) was used for these types. The type should be string, optionality can appear in the param description but is derived from a default argument. --- src/wp-includes/class-wp-token-map.php | 14 +++++++------- src/wp-includes/html-api/class-wp-html-decoder.php | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 558ca4eee3145..009ebc0cd72f4 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -435,8 +435,8 @@ public static function from_precomputed_table( $state ) { * * @since 6.6.0 * - * @param string $word Determine if this word is a lookup key in the map. 
- * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'. + * @param string $word Determine if this word is a lookup key in the map. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return bool Whether there's an entry for the given word in the map. */ public function contains( $word, $case_sensitivity = 'case-sensitive' ) { @@ -523,7 +523,7 @@ public function contains( $word, $case_sensitivity = 'case-sensitive' ) { * @param string $text String in which to search for a lookup key. * @param ?int $offset How many bytes into the string where the lookup key ought to start. * @param ?int &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set. - * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return string|null Mapped value of lookup key if found, otherwise `null`. */ public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) { @@ -572,10 +572,10 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu * * @since 6.6.0. * - * @param string $text String in which to search for a lookup key. - * @param ?int $offset How many bytes into the string where the lookup key ought to start. - * @param ?int &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set. - * @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'. + * @param string $text String in which to search for a lookup key. + * @param ?int $offset How many bytes into the string where the lookup key ought to start. 
+ * @param ?int &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return string|null Mapped value of lookup key if found, otherwise `null`. */ private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) { diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index b03756a6fbcc3..fdc7034832a93 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -25,9 +25,9 @@ class WP_HTML_Decoder { * * @since 6.6.0 * - * @param string $haystack String containing the raw non-decoded attribute value. - * @param string $search_text Does the attribute value start with this plain string. - * @param ?string $case_sensitivity Set to `ascii-case-insensitive` to ignore ASCII case when matching. + * @param string $haystack String containing the raw non-decoded attribute value. + * @param string $search_text Does the attribute value start with this plain string. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return bool Whether the attribute value starts with the given string. */ public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) { From d38df36222b91fbd08e068657590dd7b422569ef Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 14:15:31 +0200 Subject: [PATCH 17/21] Fix optional @param types ?Type is a nullable Type, not an optional parameter. Identify optional parameters in the @param description. 
--- src/wp-includes/class-wp-token-map.php | 10 +++++----- src/wp-includes/html-api/class-wp-html-decoder.php | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 009ebc0cd72f4..a932f4cae4d27 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -521,8 +521,8 @@ public function contains( $word, $case_sensitivity = 'case-sensitive' ) { * @since 6.6.0 * * @param string $text String in which to search for a lookup key. - * @param ?int $offset How many bytes into the string where the lookup key ought to start. - * @param ?int &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set. + * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. + * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null. * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return string|null Mapped value of lookup key if found, otherwise `null`. */ @@ -573,8 +573,8 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu * @since 6.6.0. * * @param string $text String in which to search for a lookup key. - * @param ?int $offset How many bytes into the string where the lookup key ought to start. - * @param ?int &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set. + * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. + * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null. * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. 
* @return string|null Mapped value of lookup key if found, otherwise `null`. */ @@ -692,7 +692,7 @@ public function to_array() { * * @since 6.6.0 * - * @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character. + * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t". * @return string Value which can be pasted into a PHP source file for quick loading of table. */ public function precomputed_php_source_table( $indent = "\t" ) { diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index fdc7034832a93..d8d2a582aea0a 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -197,8 +197,8 @@ public static function decode( $context, $text ) { * * @param string $context `attribute` for decoding attribute values, `data` otherwise. * @param string $text Text document containing span of text to decode. - * @param ?int $at Byte offset into text where span begins, defaults to the beginning. - * @param ?int $byte_length_of_matched_token Set to byte length of matched character reference, if matched, + * @param ?int $at Optional. Byte offset into text where span begins, defaults to the beginning. + * @param ?int $byte_length_of_matched_token Optional. Set to byte length of matched character reference, if matched, * otherwise not set. This is an "out" parameter. * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. 
*/ From 9931e09f50aeb3cdcad7e4615f0e69205b85287f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 14:21:51 +0200 Subject: [PATCH 18/21] Use default argument for read_character_reference at Instead of a nullable parameter that updates to `0` in the body of the function when null, use a default argument `0`, update the @param type and remove the null check and set from the function body. --- src/wp-includes/html-api/class-wp-html-decoder.php | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index d8d2a582aea0a..52f6143ab2f45 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -197,20 +197,16 @@ public static function decode( $context, $text ) { * * @param string $context `attribute` for decoding attribute values, `data` otherwise. * @param string $text Text document containing span of text to decode. - * @param ?int $at Optional. Byte offset into text where span begins, defaults to the beginning. + * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). * @param ?int $byte_length_of_matched_token Optional. Set to byte length of matched character reference, if matched, * otherwise not set. This is an "out" parameter. * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. */ - public static function read_character_reference( $context, $text, $at, &$byte_length_of_matched_token = null ) { + public static function read_character_reference( $context, $text, $at = 0, &$byte_length_of_matched_token = null ) { /** @var WP_Token_Map $html5_named_character_references */ global $html5_named_character_references; - if ( ! 
isset( $at ) ) { - $at = 0; - } - $length = strlen( $text ); if ( $at + 1 >= $length ) { return null; From 95820aacc9dd14c561cdcbc907a58d0f433d71cf Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 14:30:02 +0200 Subject: [PATCH 19/21] Add additional attribute prefix tests Add mixed case and non-matching test cases --- tests/phpunit/tests/html-api/wpHtmlDecoder.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlDecoder.php b/tests/phpunit/tests/html-api/wpHtmlDecoder.php index 1355d8ecef489..82d6a10d349db 100644 --- a/tests/phpunit/tests/html-api/wpHtmlDecoder.php +++ b/tests/phpunit/tests/html-api/wpHtmlDecoder.php @@ -132,6 +132,10 @@ public static function data_attributes_with_prefix_and_case_sensitive_match() { array( 'http://wordpress.org', 'http', 'ascii-case-insensitive', true ), array( 'http://wordpress.org', 'HTTP', 'case-sensitive', false ), array( 'http://wordpress.org', 'HTTP', 'ascii-case-insensitive', true ), + array( 'http://wordpress.org', 'Http', 'case-sensitive', false ), + array( 'http://wordpress.org', 'Http', 'ascii-case-insensitive', true ), + array( 'http://wordpress.org', 'https', 'case-sensitive', false ), + array( 'http://wordpress.org', 'https', 'ascii-case-insensitive', false ), ); } } From 47dcdc1b60306c7435b1474be193878445cbff65 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 29 May 2024 14:37:37 +0200 Subject: [PATCH 20/21] Change byte_length_of_matched_token to matched_token_byte_length Update the parameter name and description to align with the descriptions of analogous parameters used in WP_Token_Map methods. 
--- .../html-api/class-wp-html-decoder.php | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 52f6143ab2f45..91278cee82083 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -173,7 +173,7 @@ public static function decode( $context, $text ) { * depending on the context in which it's found. * * If a character reference is found, this function will return the translated value - * that the reference maps to. It will then set `$byte_length_of_matched_token` the + * that the reference maps to. It will then set `$matched_token_byte_length` the * number of bytes of input it read while consuming the character reference. This * gives calling code the opportunity to advance its cursor when traversing a string * and decoding. @@ -195,14 +195,13 @@ public static function decode( $context, $text ) { * * @since 6.6.0 * - * @param string $context `attribute` for decoding attribute values, `data` otherwise. - * @param string $text Text document containing span of text to decode. - * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). - * @param ?int $byte_length_of_matched_token Optional. Set to byte length of matched character reference, if matched, - * otherwise not set. This is an "out" parameter. + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). + * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null. * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. 
*/ - public static function read_character_reference( $context, $text, $at = 0, &$byte_length_of_matched_token = null ) { + public static function read_character_reference( $context, $text, $at = 0, &$matched_token_byte_length = null ) { /** @var WP_Token_Map $html5_named_character_references */ global $html5_named_character_references; @@ -259,13 +258,13 @@ public static function read_character_reference( $context, $text, $at = 0, &$byt // Whereas `&#` and only zeros is invalid. if ( 0 === $digit_count ) { - $byte_length_of_matched_token = $end_of_span - $at; + $matched_token_byte_length = $end_of_span - $at; return '�'; } // If there are too many digits then it's not worth parsing. It's invalid. if ( $digit_count > $max_digits ) { - $byte_length_of_matched_token = $end_of_span - $at; + $matched_token_byte_length = $end_of_span - $at; return '�'; } @@ -342,7 +341,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$byt $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; } - $byte_length_of_matched_token = $end_of_span - $at; + $matched_token_byte_length = $end_of_span - $at; return self::code_point_to_utf8_bytes( $code_point ); } @@ -363,7 +362,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$byt // If the match ended with a semicolon then it should always be decoded. if ( ';' === $text[ $name_at + $name_length - 1 ] ) { - $byte_length_of_matched_token = $after_name - $at; + $matched_token_byte_length = $after_name - $at; return $replacement; } @@ -383,7 +382,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$byt // It's non-ambiguous, safe to leave it in. if ( ! 
$ambiguous_follower ) { - $byte_length_of_matched_token = $after_name - $at; + $matched_token_byte_length = $after_name - $at; return $replacement; } @@ -392,7 +391,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$byt return null; } - $byte_length_of_matched_token = $after_name - $at; + $matched_token_byte_length = $after_name - $at; return $replacement; } From 0c0b36cd6dcfc59ca82f06db0dfd631c0640dc89 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 31 May 2024 12:15:14 +0200 Subject: [PATCH 21/21] Docblock cleanup --- .../html-api/class-wp-html-decoder.php | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 91278cee82083..78976002b4a93 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -27,7 +27,8 @@ class WP_HTML_Decoder { * * @param string $haystack String containing the raw non-decoded attribute value. * @param string $search_text Does the attribute value start with this plain string. - * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. + * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. + * Default 'case-sensitive'. * @return bool Whether the attribute value starts with the given string. */ public static function attribute_starts_with( $haystack, $search_text, $case_sensitivity = 'case-sensitive' ) { @@ -173,7 +174,7 @@ public static function decode( $context, $text ) { * depending on the context in which it's found. * * If a character reference is found, this function will return the translated value - * that the reference maps to. It will then set `$matched_token_byte_length` the + * that the reference maps to. 
It will then set `$match_byte_length` the * number of bytes of input it read while consuming the character reference. This * gives calling code the opportunity to advance its cursor when traversing a string * and decoding. @@ -195,15 +196,19 @@ public static function decode( $context, $text ) { * * @since 6.6.0 * - * @param string $context `attribute` for decoding attribute values, `data` otherwise. - * @param string $text Text document containing span of text to decode. - * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). - * @param ?int &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null. + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param int $at Optional. Byte offset into text where span begins, defaults to the beginning (0). + * @param int &$match_byte_length Optional. Set to byte-length of character reference if provided and if a match + * is found, otherwise not set. Default null. * @return string|false Decoded character reference in UTF-8 if found, otherwise `false`. */ - public static function read_character_reference( $context, $text, $at = 0, &$matched_token_byte_length = null ) { - - /** @var WP_Token_Map $html5_named_character_references */ + public static function read_character_reference( $context, $text, $at = 0, &$match_byte_length = null ) { + /** + * Mappings for HTML5 named character references. + * + * @var WP_Token_Map $html5_named_character_references + */ global $html5_named_character_references; $length = strlen( $text ); @@ -258,13 +263,13 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat // Whereas `&#` and only zeros is invalid. 
if ( 0 === $digit_count ) { - $matched_token_byte_length = $end_of_span - $at; + $match_byte_length = $end_of_span - $at; return '�'; } // If there are too many digits then it's not worth parsing. It's invalid. if ( $digit_count > $max_digits ) { - $matched_token_byte_length = $end_of_span - $at; + $match_byte_length = $end_of_span - $at; return '�'; } @@ -341,7 +346,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; } - $matched_token_byte_length = $end_of_span - $at; + $match_byte_length = $end_of_span - $at; return self::code_point_to_utf8_bytes( $code_point ); } @@ -354,7 +359,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat $name_length = 0; $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); - if ( ! isset( $replacement ) ) { + if ( false === $replacement ) { return null; } @@ -362,7 +367,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat // If the match ended with a semicolon then it should always be decoded. if ( ';' === $text[ $name_at + $name_length - 1 ] ) { - $matched_token_byte_length = $after_name - $at; + $match_byte_length = $after_name - $at; return $replacement; } @@ -382,7 +387,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat // It's non-ambiguous, safe to leave it in. if ( ! $ambiguous_follower ) { - $matched_token_byte_length = $after_name - $at; + $match_byte_length = $after_name - $at; return $replacement; } @@ -391,16 +396,16 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat return null; } - $matched_token_byte_length = $after_name - $at; + $match_byte_length = $after_name - $at; return $replacement; } /** * Encode a code point number into the UTF-8 encoding. 
* - * This encoder implements the encoding algorithm for converting a number - * into a byte sequence, but if it receives an invalid code point it will - * return the Unicode Replacement Character U+FFFD `�`. + * This encoder implements the UTF-8 encoding algorithm for converting + * a code point into a byte sequence. If it receives an invalid code + * point it will return the Unicode Replacement Character U+FFFD `�`. * * Example: * @@ -411,7 +416,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * * @since 6.6.0 * - * @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8 + * @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard. * * @param int $code_point Which code point to convert. * @return string Converted code point, or `�` if invalid.