|
1 | 1 | /**
|
2 | 2 | * External dependencies
|
3 | 3 | */
|
4 |
| -import { tokenize } from 'simple-html-tokenizer'; |
5 |
| -import { xor, fromPairs, isEqual, includes, stubTrue } from 'lodash'; |
| 4 | +import Tokenizer from 'simple-html-tokenizer/dist/es6/tokenizer'; |
| 5 | +import { |
| 6 | + identity, |
| 7 | + xor, |
| 8 | + fromPairs, |
| 9 | + isEqual, |
| 10 | + includes, |
| 11 | + stubTrue, |
| 12 | +} from 'lodash'; |
6 | 13 |
|
7 | 14 | /**
|
8 | 15 | * WordPress dependencies
|
9 | 16 | */
|
10 | 17 | import deprecated from '@wordpress/deprecated';
|
| 18 | +import { decodeEntities } from '@wordpress/html-entities'; |
11 | 19 |
|
12 | 20 | /**
|
13 | 21 | * Internal dependencies
|
@@ -134,6 +142,40 @@ const MEANINGFUL_ATTRIBUTES = [
|
134 | 142 | ...ENUMERATED_ATTRIBUTES,
|
135 | 143 | ];
|
136 | 144 |
|
| 145 | +/** |
| 146 | + * Array of functions which receive a text string on which to apply normalizing |
| 147 | + * behavior for consideration in text token equivalence, carefully ordered from |
| 148 | + * least-to-most expensive operations. |
| 149 | + * |
| 150 | + * @type {Function[]} |
| 151 | + */ |
| 152 | +const TEXT_NORMALIZATIONS = [ |
| 153 | + identity, |
| 154 | + getTextWithCollapsedWhitespace, |
| 155 | +]; |
| 156 | + |
| 157 | +/** |
| 158 | + * Substitute EntityParser class for `simple-html-tokenizer` which decodes |
| 159 | + * entities by delegating to `decodeEntities` from `@wordpress/html-entities`. |
| 160 | + * |
| 161 | + * @see https://github.com/tildeio/simple-html-tokenizer/tree/master/src/entity-parser.ts |
| 162 | + */ |
| 163 | +export class IdentityEntityParser { |
| 164 | + /** |
| 165 | + * Returns a substitute string for an entity string sequence between `&` |
| 166 | + * and `;`, or undefined if no substitution should occur. |
| 167 | + * |
| 168 | + * In this implementation, the fragment is re-wrapped as `&entity;` and the result of `decodeEntities` is returned. |
| 169 | + * |
| 170 | + * @param {string} entity Entity fragment discovered in HTML. |
| 171 | + * |
| 172 | + * @return {?string} Entity substitute value. |
| 173 | + */ |
| 174 | + parse( entity ) { |
| 175 | + return decodeEntities( '&' + entity + ';' ); |
| 176 | + } |
| 177 | +} |
| 178 | + |
137 | 179 | /**
|
138 | 180 | * Object of logger functions.
|
139 | 181 | */
|
@@ -186,6 +228,10 @@ export function getTextPiecesSplitOnWhitespace( text ) {
|
186 | 228 | * @return {string} Trimmed text with consecutive whitespace collapsed.
|
187 | 229 | */
|
188 | 230 | export function getTextWithCollapsedWhitespace( text ) {
|
| 231 | + // This is an overly simplified whitespace comparison. The specification is |
| 232 | + // more prescriptive of whitespace behavior in inline and block contexts. |
| 233 | + // |
| 234 | + // See: https://medium.com/@patrickbrosset/when-does-white-space-matter-in-html-b90e8a7cdd33 |
189 | 235 | return getTextPiecesSplitOnWhitespace( text ).join( ' ' );
|
190 | 236 | }
|
191 | 237 |
|
@@ -220,18 +266,28 @@ export function getMeaningfulAttributePairs( token ) {
|
220 | 266 | *
|
221 | 267 | * @return {boolean} Whether two text tokens are equivalent.
|
222 | 268 | */
|
223 |
| -export function isEqualTextTokensWithCollapsedWhitespace( actual, expected ) { |
224 |
| - // This is an overly simplified whitespace comparison. The specification is |
225 |
| - // more prescriptive of whitespace behavior in inline and block contexts. |
226 |
| - // |
227 |
| - // See: https://medium.com/@patrickbrosset/when-does-white-space-matter-in-html-b90e8a7cdd33 |
228 |
| - const isEquivalentText = isEqual( ...[ actual.chars, expected.chars ].map( getTextWithCollapsedWhitespace ) ); |
229 |
| - |
230 |
| - if ( ! isEquivalentText ) { |
231 |
| - log.warning( 'Expected text `%s`, saw `%s`.', expected.chars, actual.chars ); |
| 269 | +export function isEquivalentTextTokens( actual, expected ) { |
| 270 | + // This function is intentionally written as syntactically "ugly" as a hot |
| 271 | + // path optimization. Text is progressively normalized in order from least- |
| 272 | + // to-most operationally expensive, until the earliest point at which text |
| 273 | + // can be confidently inferred as being equal. |
| 274 | + let actualChars = actual.chars; |
| 275 | + let expectedChars = expected.chars; |
| 276 | + |
| 277 | + for ( let i = 0; i < TEXT_NORMALIZATIONS.length; i++ ) { |
| 278 | + const normalize = TEXT_NORMALIZATIONS[ i ]; |
| 279 | + |
| 280 | + actualChars = normalize( actualChars ); |
| 281 | + expectedChars = normalize( expectedChars ); |
| 282 | + |
| 283 | + if ( actualChars === expectedChars ) { |
| 284 | + return true; |
| 285 | + } |
232 | 286 | }
|
233 | 287 |
|
234 |
| - return isEquivalentText; |
| 288 | + log.warning( 'Expected text `%s`, saw `%s`.', expected.chars, actual.chars ); |
| 289 | + |
| 290 | + return false; |
235 | 291 | }
|
236 | 292 |
|
237 | 293 | /**
|
@@ -359,8 +415,8 @@ export const isEqualTokensOfType = {
|
359 | 415 | ...[ actual, expected ].map( getMeaningfulAttributePairs )
|
360 | 416 | );
|
361 | 417 | },
|
362 |
| - Chars: isEqualTextTokensWithCollapsedWhitespace, |
363 |
| - Comment: isEqualTextTokensWithCollapsedWhitespace, |
| 418 | + Chars: isEquivalentTextTokens, |
| 419 | + Comment: isEquivalentTextTokens, |
364 | 420 | };
|
365 | 421 |
|
366 | 422 | /**
|
@@ -396,7 +452,7 @@ export function getNextNonWhitespaceToken( tokens ) {
|
396 | 452 | */
|
397 | 453 | function getHTMLTokens( html ) {
|
398 | 454 | try {
|
399 |
| - return tokenize( html ); |
| 455 | + return new Tokenizer( new IdentityEntityParser() ).tokenize( html ); |
400 | 456 | } catch ( e ) {
|
401 | 457 | log.warning( 'Malformed HTML detected: %s', html );
|
402 | 458 | }
|
|
0 commit comments