Block API: Consider encoding-normalized text as equivalent (#11771)

aduth · youknowriad · commit 12372430df6d · 2018-11-15T17:25:04.000+01:00
diff --git a/lib/client-assets.php b/lib/client-assets.php
@@ -470,6 +470,7 @@ function gutenberg_register_scripts_and_styles() {
 			'wp-dom',
 			'wp-element',
 			'wp-hooks',
+			'wp-html-entities',
 			'wp-i18n',
 			'wp-is-shallow-equal',
 			'wp-polyfill',
diff --git a/package-lock.json b/package-lock.json
diff --git a/packages/blocks/CHANGELOG.md b/packages/blocks/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 5.3.2 (Unreleased)
+
+### Bug Fix
+
+- The block validator is more lenient toward equivalent encoding forms.
+
 ## 5.3.1 (2018-11-12)
 
 ## 5.3.0 (2018-11-09)
diff --git a/packages/blocks/package.json b/packages/blocks/package.json
@@ -29,6 +29,7 @@
 		"@wordpress/dom": "file:../dom",
 		"@wordpress/element": "file:../element",
 		"@wordpress/hooks": "file:../hooks",
+		"@wordpress/html-entities": "file:../html-entities",
 		"@wordpress/i18n": "file:../i18n",
 		"@wordpress/is-shallow-equal": "file:../is-shallow-equal",
 		"@wordpress/shortcode": "file:../shortcode",
diff --git a/packages/blocks/src/api/test/validation.js b/packages/blocks/src/api/test/validation.js
@@ -2,10 +2,11 @@
  * Internal dependencies
  */
 import {
+	IdentityEntityParser,
 	getTextPiecesSplitOnWhitespace,
 	getTextWithCollapsedWhitespace,
 	getMeaningfulAttributePairs,
-	isEqualTextTokensWithCollapsedWhitespace,
+	isEquivalentTextTokens,
 	getNormalizedStyleValue,
 	getStyleProperties,
 	isEqualAttributesOfName,
@@ -40,6 +41,16 @@ describe( 'validation', () => {
 		} );
 	} );
 
+	describe( 'IdentityEntityParser', () => {
+		it( 'can be constructed', () => {
+			expect( new IdentityEntityParser() instanceof IdentityEntityParser ).toBe( true );
+		} );
+
+		it( 'returns parse as decoded value', () => {
+			expect( new IdentityEntityParser().parse( 'quot' ) ).toBe( '"' );
+		} );
+	} );
+
 	describe( 'getTextPiecesSplitOnWhitespace()', () => {
 		it( 'returns text pieces spilt on whitespace', () => {
 			const pieces = getTextPiecesSplitOnWhitespace( '  a \t  b \n c' );
@@ -98,9 +109,9 @@ describe( 'validation', () => {
 		} );
 	} );
 
-	describe( 'isEqualTextTokensWithCollapsedWhitespace()', () => {
+	describe( 'isEquivalentTextTokens()', () => {
 		it( 'should return false if not equal with collapsed whitespace', () => {
-			const isEqual = isEqualTextTokensWithCollapsedWhitespace(
+			const isEqual = isEquivalentTextTokens(
 				{ chars: '  a \t  b \n c' },
 				{ chars: 'a \n c \t b  ' },
 			);
@@ -110,7 +121,7 @@ describe( 'validation', () => {
 		} );
 
 		it( 'should return true if equal with collapsed whitespace', () => {
-			const isEqual = isEqualTextTokensWithCollapsedWhitespace(
+			const isEqual = isEquivalentTextTokens(
 				{ chars: '  a \t  b \n c' },
 				{ chars: 'a \n b \t c  ' },
 			);
@@ -379,8 +390,8 @@ describe( 'validation', () => {
 
 		it( 'should return true for effectively equivalent html', () => {
 			const isEquivalent = isEquivalentHTML(
-				'<div>&quot; Hello<span   class="b a" id="foo"> World!</  span>  "</div>',
-				'<div  >" Hello\n<span id="foo" class="a  b">World!</span>"</div>'
+				'<div>&quot; Hello<span   class="b a" id="foo" data-foo="here &mdash; there"> World! &#128517;</  span>  "</div>',
+				'<div  >" Hello\n<span id="foo" class="a  b" data-foo="here — there">World! 😅</span>"</div>'
 			);
 
 			expect( isEquivalent ).toBe( true );
diff --git a/packages/blocks/src/api/validation.js b/packages/blocks/src/api/validation.js
@@ -1,13 +1,21 @@
 /**
  * External dependencies
  */
-import { tokenize } from 'simple-html-tokenizer';
-import { xor, fromPairs, isEqual, includes, stubTrue } from 'lodash';
+import Tokenizer from 'simple-html-tokenizer/dist/es6/tokenizer';
+import {
+	identity,
+	xor,
+	fromPairs,
+	isEqual,
+	includes,
+	stubTrue,
+} from 'lodash';
 
 /**
  * WordPress dependencies
  */
 import deprecated from '@wordpress/deprecated';
+import { decodeEntities } from '@wordpress/html-entities';
 
 /**
  * Internal dependencies
@@ -134,6 +142,40 @@ const MEANINGFUL_ATTRIBUTES = [
 	...ENUMERATED_ATTRIBUTES,
 ];
 
+/**
+ * Array of functions which receive a text string on which to apply normalizing
+ * behavior for consideration in text token equivalence, carefully ordered from
+ * least-to-most expensive operations.
+ *
+ * @type {Array}
+ */
+const TEXT_NORMALIZATIONS = [
+	identity,
+	getTextWithCollapsedWhitespace,
+];
+
+/**
+ * Substitute EntityParser class for `simple-html-tokenizer` which resolves
+ * entities through `decodeEntities` instead of the tokenizer's own parser.
+ *
+ * @see https://github.com/tildeio/simple-html-tokenizer/tree/master/src/entity-parser.ts
+ */
+export class IdentityEntityParser {
+	/**
+	 * Returns a substitute string for an entity string sequence between `&`
+	 * and `;`, or undefined if no substitution should occur.
+	 *
+	 * In this implementation, the result of `decodeEntities` is always returned.
+	 *
+	 * @param {string} entity Entity fragment discovered in HTML.
+	 *
+	 * @return {?string} Entity substitute value.
+	 */
+	parse( entity ) {
+		return decodeEntities( '&' + entity + ';' );
+	}
+}
+
 /**
  * Object of logger functions.
  */
@@ -186,6 +228,10 @@ export function getTextPiecesSplitOnWhitespace( text ) {
  * @return {string} Trimmed text with consecutive whitespace collapsed.
  */
 export function getTextWithCollapsedWhitespace( text ) {
+	// This is an overly simplified whitespace comparison. The specification is
+	// more prescriptive of whitespace behavior in inline and block contexts.
+	//
+	// See: https://medium.com/@patrickbrosset/when-does-white-space-matter-in-html-b90e8a7cdd33
 	return getTextPiecesSplitOnWhitespace( text ).join( ' ' );
 }
 
@@ -220,18 +266,28 @@ export function getMeaningfulAttributePairs( token ) {
  *
  * @return {boolean} Whether two text tokens are equivalent.
  */
-export function isEqualTextTokensWithCollapsedWhitespace( actual, expected ) {
-	// This is an overly simplified whitespace comparison. The specification is
-	// more prescriptive of whitespace behavior in inline and block contexts.
-	//
-	// See: https://medium.com/@patrickbrosset/when-does-white-space-matter-in-html-b90e8a7cdd33
-	const isEquivalentText = isEqual( ...[ actual.chars, expected.chars ].map( getTextWithCollapsedWhitespace ) );
-
-	if ( ! isEquivalentText ) {
-		log.warning( 'Expected text `%s`, saw `%s`.', expected.chars, actual.chars );
+export function isEquivalentTextTokens( actual, expected ) {
+	// This function is intentionally written as syntactically "ugly" as a hot
+	// path optimization. Text is progressively normalized in order from least-
+	// to-most operationally expensive, until the earliest point at which text
+	// can be confidently inferred as being equal.
+	let actualChars = actual.chars;
+	let expectedChars = expected.chars;
+
+	for ( let i = 0; i < TEXT_NORMALIZATIONS.length; i++ ) {
+		const normalize = TEXT_NORMALIZATIONS[ i ];
+
+		actualChars = normalize( actualChars );
+		expectedChars = normalize( expectedChars );
+
+		if ( actualChars === expectedChars ) {
+			return true;
+		}
 	}
 
-	return isEquivalentText;
+	log.warning( 'Expected text `%s`, saw `%s`.', expected.chars, actual.chars );
+
+	return false;
 }
 
 /**
@@ -359,8 +415,8 @@ export const isEqualTokensOfType = {
 			...[ actual, expected ].map( getMeaningfulAttributePairs )
 		);
 	},
-	Chars: isEqualTextTokensWithCollapsedWhitespace,
-	Comment: isEqualTextTokensWithCollapsedWhitespace,
+	Chars: isEquivalentTextTokens,
+	Comment: isEquivalentTextTokens,
 };
 
 /**
@@ -396,7 +452,7 @@ export function getNextNonWhitespaceToken( tokens ) {
  */
 function getHTMLTokens( html ) {
 	try {
-		return tokenize( html );
+		return new Tokenizer( new IdentityEntityParser() ).tokenize( html );
 	} catch ( e ) {
 		log.warning( 'Malformed HTML detected: %s', html );
 	}
diff --git a/test/unit/jest.config.json b/test/unit/jest.config.json
@@ -14,5 +14,8 @@
 		"/test/e2e",
 		"<rootDir>/.*/build/",
 		"<rootDir>/.*/build-module/"
+	],
+	"transformIgnorePatterns": [
+		"node_modules/(?!(simple-html-tokenizer)/)"
 	]
 }

Original file line number	Diff line number	Diff line change
`@@ -14,5 +14,8 @@`
`14`	`14`	`"/test/e2e",`
`15`	`15`	`"<rootDir>/.*/build/",`
`16`	`16`	`"<rootDir>/.*/build-module/"`
	`17`	`+ ],`
	`18`	`+ "transformIgnorePatterns": [`
	`19`	`+ "node_modules/(?!(simple-html-tokenizer)/)"`
`17`	`20`	`]`
`18`	`21`	`}`