From c823d187fb8645c0d3bf4b3e96d9aa2677d2a478 Mon Sep 17 00:00:00 2001 From: iseulde Date: Wed, 18 Sep 2019 11:53:17 +0300 Subject: [PATCH 1/7] Paste: Remove HTML Formatting Space --- .../raw-handling/html-formatting-remover.js | 90 +++++++++++++++++++ .../src/api/raw-handling/paste-handler.js | 2 + test/integration/fixtures/apple-out.html | 22 +---- test/integration/fixtures/classic-out.html | 2 +- test/integration/fixtures/evernote-out.html | 12 +-- test/integration/fixtures/markdown-out.html | 3 +- test/integration/fixtures/ms-word-out.html | 26 +----- .../fixtures/ms-word-styled-out.html | 12 +-- 8 files changed, 103 insertions(+), 66 deletions(-) create mode 100644 packages/blocks/src/api/raw-handling/html-formatting-remover.js diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js new file mode 100644 index 0000000000000..85c982f66c2b3 --- /dev/null +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -0,0 +1,90 @@ +/** + * Internal dependencies + */ +import { isPhrasingContent } from './phrasing-content'; + +function getSibling( node, which ) { + const sibling = node[ `${ which }Sibling` ]; + + if ( sibling ) { + return sibling; + } + + const { parentNode } = node; + + if ( ! parentNode ) { + return; + } + + if ( ! isPhrasingContent( parentNode ) ) { + return; + } + + return getSibling( parentNode, which ); +} + +function isFormattingSpace( character ) { + return ( + character === ' ' || + character === '\t' || + character === '\n' + ); +} + +/** + * Looks for comments, and removes them. + * + * @param {Node} node The node to be processed. + * @return {void} + */ +export default function( node ) { + if ( node.nodeType !== node.TEXT_NODE ) { + return; + } + + if ( isFormattingSpace( node.data[ 0 ] ) ) { + const previousSibling = getSibling( node, 'previous' ); + const hasPreviousSpace = ( + ! previousSibling || + previousSibling.nodeName === 'BR' || + previousSibling.textContent.slice( -1 ) === ' ' + ); + node.data = node.data.replace( /^[ \n\t]+/, hasPreviousSpace ? '' : ' ' ); + } + + if ( isFormattingSpace( node.data.slice( -1 ) ) ) { + const nextSibling = getSibling( node, 'next' ); + const hasNextSpace = ( + ! nextSibling || + nextSibling.nodeName === 'BR' || + isFormattingSpace( nextSibling.textContent[ 0 ] ) + ); + node.data = node.data.replace( /[ \n\t]+$/, hasNextSpace ? '' : ' ' ); + } + + // Require at least two characters before attempting to replace formatting + // spaces in the middle. + if ( node.data.length > 2 ) { + const middle = node.data.slice( 1, -1 ); + + // Require at least two spaces in a row, a line break, or a tab + // character before trying to replace anything. + if ( + middle.indexOf( ' ' ) !== -1 || + middle.indexOf( '\n' ) !== -1 || + middle.indexOf( '\t' ) !== -1 + ) { + node.data = ( + node.data[ 0 ] + + middle.replace( /[ \n\t]+/g, ' ' ) + + node.data[ node.data.length - 1 ] + ); + } + } + + // If there's no data left, remove the node, so `previousSibling` stays + // accurate. + if ( ! node.data ) { + node.parentNode.removeChild( node ); + } +} diff --git a/packages/blocks/src/api/raw-handling/paste-handler.js b/packages/blocks/src/api/raw-handling/paste-handler.js index fdea17bd0eba2..2519f356657f7 100644 --- a/packages/blocks/src/api/raw-handling/paste-handler.js +++ b/packages/blocks/src/api/raw-handling/paste-handler.js @@ -24,6 +24,7 @@ import shortcodeConverter from './shortcode-converter'; import markdownConverter from './markdown-converter'; import iframeRemover from './iframe-remover'; import googleDocsUIDRemover from './google-docs-uid-remover'; +import HTMLFormattingRemover from './html-formatting-remover'; import { getPhrasingContentSchema } from './phrasing-content'; import { deepFilterHTML, @@ -224,6 +225,7 @@ export function pasteHandler( { HTML = '', plainText = '', mode = 'AUTO', tagNam piece = deepFilterHTML( piece, filters, blockContentSchema ); piece = removeInvalidHTML( piece, schema ); + piece = deepFilterHTML( piece, [ HTMLFormattingRemover ], blockContentSchema ); piece = normaliseBlocks( piece ); // Allows us to ask for this information when we get a report. diff --git a/test/integration/fixtures/apple-out.html b/test/integration/fixtures/apple-out.html index 40d4f1a8d133e..f040d38235291 100644 --- a/test/integration/fixtures/apple-out.html +++ b/test/integration/fixtures/apple-out.html @@ -19,27 +19,9 @@ -
-One - -Two - -Three -
-1 - -2 - -3 -
-I - -II - -III -
+
OneTwoThree
123
IIIIII
-

An image:

+

An image:

diff --git a/test/integration/fixtures/classic-out.html b/test/integration/fixtures/classic-out.html index dbc94528db7b2..1298053376dfa 100644 --- a/test/integration/fixtures/classic-out.html +++ b/test/integration/fixtures/classic-out.html @@ -15,7 +15,7 @@ -

Fourth paragraph

+

Fourth paragraph

diff --git a/test/integration/fixtures/evernote-out.html b/test/integration/fixtures/evernote-out.html index c984df312f0be..32d2deb764897 100644 --- a/test/integration/fixtures/evernote-out.html +++ b/test/integration/fixtures/evernote-out.html @@ -1,7 +1,5 @@ -

This is a paragraph. -
This is a link. -

+

This is a paragraph.
This is a link.

@@ -17,13 +15,7 @@ -
One -Two -Three -
Four -Five -Six -
+
OneTwoThree
FourFiveSix
diff --git a/test/integration/fixtures/markdown-out.html b/test/integration/fixtures/markdown-out.html index 85a105b640923..dc064b3c12f00 100644 --- a/test/integration/fixtures/markdown-out.html +++ b/test/integration/fixtures/markdown-out.html @@ -7,8 +7,7 @@

This is a heading with italic

-

Preserve
-line breaks please.

+

Preserve
line breaks please.

diff --git a/test/integration/fixtures/ms-word-out.html b/test/integration/fixtures/ms-word-out.html index 47f7cab83ae73..79871a73e6a3f 100644 --- a/test/integration/fixtures/ms-word-out.html +++ b/test/integration/fixtures/ms-word-out.html @@ -1,11 +1,9 @@ -

This is a -title

+

This is a title

-

This is a -subtitle

+

This is a subtitle

@@ -29,25 +27,7 @@

This is a heading level 2

-
- One - - Two - - Three -
- 1 - - 2 - - 3 -
- I - - II - - III -
+
OneTwoThree
123
IIIIII
diff --git a/test/integration/fixtures/ms-word-styled-out.html b/test/integration/fixtures/ms-word-styled-out.html index 2eef7365f9668..277d8d37bbc63 100644 --- a/test/integration/fixtures/ms-word-styled-out.html +++ b/test/integration/fixtures/ms-word-styled-out.html @@ -1,15 +1,7 @@ -

-Lorem -ipsum dolor sit amet, consectetur adipiscing elit  -

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit 

-

-Lorem -ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque -aliquet hendrerit auctor. Nam lobortis, est vel lacinia tincidunt, -purus tellus vehicula ex, nec pharetra justo dui sed lorem. Nam -congue laoreet massa, quis varius est tincidunt ut.

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque aliquet hendrerit auctor. Nam lobortis, est vel lacinia tincidunt, purus tellus vehicula ex, nec pharetra justo dui sed lorem. Nam congue laoreet massa, quis varius est tincidunt ut.

From 893937f4459e179c901a0c79c43282f541975959 Mon Sep 17 00:00:00 2001 From: iseulde Date: Wed, 18 Sep 2019 12:32:23 +0300 Subject: [PATCH 2/7] Simplify --- .../raw-handling/html-formatting-remover.js | 58 ++++++++----------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js index 85c982f66c2b3..d7b6fa84067c4 100644 --- a/packages/blocks/src/api/raw-handling/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -24,11 +24,7 @@ function getSibling( node, which ) { } function isFormattingSpace( character ) { - return ( - character === ' ' || - character === '\t' || - character === '\n' - ); + return character === ' ' || character === '\t' || character === '\n'; } /** @@ -42,49 +38,45 @@ export default function( node ) { return; } - if ( isFormattingSpace( node.data[ 0 ] ) ) { + // First, replace any sequence of HTML formatting space with a single space. + let newData = node.data.replace( /[ \n\t]+/g, ' ' ); + + // Remove the leading space if the text element is at the start of a block, + // is preceded by a line break element, or has a space in the previous + // element. + if ( newData[ 0 ] === ' ' ) { const previousSibling = getSibling( node, 'previous' ); - const hasPreviousSpace = ( + + if ( ! previousSibling || previousSibling.nodeName === 'BR' || previousSibling.textContent.slice( -1 ) === ' ' - ); - node.data = node.data.replace( /^[ \n\t]+/, hasPreviousSpace ? '' : ' ' ); + ) { + newData = newData.slice( 1 ); + } } - if ( isFormattingSpace( node.data.slice( -1 ) ) ) { + // Remove the trailing space if the text element is at the end of a block, + // is succeded by a line break element, or has a space in the next element. + if ( newData[ newData.length - 1 ] === ' ' ) { const nextSibling = getSibling( node, 'next' ); - const hasNextSpace = ( + + if ( ! nextSibling || nextSibling.nodeName === 'BR' || + // Note that any next node data has not yet been replaced, so we + // have to check for any formatting space. isFormattingSpace( nextSibling.textContent[ 0 ] ) - ); - node.data = node.data.replace( /[ \n\t]+$/, hasNextSpace ? '' : ' ' ); - } - - // Require at least two characters before attempting to replace formatting - // spaces in the middle. - if ( node.data.length > 2 ) { - const middle = node.data.slice( 1, -1 ); - - // Require at least two spaces in a row, a line break, or a tab - // character before trying to replace anything. - if ( - middle.indexOf( ' ' ) !== -1 || - middle.indexOf( '\n' ) !== -1 || - middle.indexOf( '\t' ) !== -1 ) { - node.data = ( - node.data[ 0 ] + - middle.replace( /[ \n\t]+/g, ' ' ) + - node.data[ node.data.length - 1 ] - ); + newData = newData.slice( 0, -1 ); } } // If there's no data left, remove the node, so `previousSibling` stays - // accurate. - if ( ! node.data ) { + // accurate. Otherwise, update the node data. + if ( ! newData ) { node.parentNode.removeChild( node ); + } else { + node.data = newData; } } From 18559d63e2ff4e9137c027729fc00c6fcc9a678a Mon Sep 17 00:00:00 2001 From: iseulde Date: Wed, 18 Sep 2019 13:46:30 +0300 Subject: [PATCH 3/7] Add unit tests --- .../test/html-formatting-remover.js | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 packages/blocks/src/api/raw-handling/test/html-formatting-remover.js diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js new file mode 100644 index 0000000000000..38f52901fecde --- /dev/null +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -0,0 +1,94 @@ +/** + * Internal dependencies + */ +import filter from '../html-formatting-remover'; +import { deepFilterHTML } from '../utils'; + +describe( 'HTMLFormattingRemover', () => { + it( 'should remove formatting space', () => { + const input = ` +
+ a + b +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove nested formatting space', () => { + const input = ` +
+ + a + b + +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should not remove leading or trailing space if previous or next element has no space', () => { + const input = ` +
+ a + b + c +
+ `; + const output = '
a b c
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space (empty)', () => { + const input = ` +
+
+ `; + const output = '
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove block level formatting space', () => { + const input = ` +
+
+ a +
+
+ b +
+
+ `; + const output = '
a
b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space around br', () => { + const input = ` +
+ a +
+ b +
+ `; + const output = '
a
b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space around phasing content elements', () => { + const input = ` +
+ + a + + + b + +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); +} ); From d9c138279fedb9c9b1ed05d53f744831f70a66cf Mon Sep 17 00:00:00 2001 From: iseulde Date: Thu, 19 Sep 2019 12:58:41 +0300 Subject: [PATCH 4/7] Ignore pre --- .../blocks/src/api/raw-handling/html-formatting-remover.js | 7 ++++++- .../src/api/raw-handling/test/html-formatting-remover.js | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js index d7b6fa84067c4..881f6e395cdc9 100644 --- a/packages/blocks/src/api/raw-handling/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -28,7 +28,7 @@ function isFormattingSpace( character ) { } /** - * Looks for comments, and removes them. + * Removes spacing that formats HTML. * * @param {Node} node The node to be processed. * @return {void} @@ -38,6 +38,11 @@ export default function( node ) { return; } + // Ignore pre content. + if ( node.parentElement.closest( 'pre' ) ) { + return; + } + // First, replace any sequence of HTML formatting space with a single space. let newData = node.data.replace( /[ \n\t]+/g, ' ' ); diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js index 38f52901fecde..2bbb0917626fe 100644 --- a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -91,4 +91,9 @@ describe( 'HTMLFormattingRemover', () => { const output = '
a b
'; expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); } ); + + it( 'should ignore pre', () => { + const input = `
 a\n b\n
`; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( input ); + } ); } ); From 29d2f307ce36caaa6266d74a6303a09225613b28 Mon Sep 17 00:00:00 2001 From: Ella van Durpe Date: Mon, 25 Nov 2019 16:17:17 +0100 Subject: [PATCH 5/7] Address feedback --- .../raw-handling/html-formatting-remover.js | 27 +++++++------------ .../src/api/raw-handling/paste-handler.js | 4 +-- .../test/html-formatting-remover.js | 2 +- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js index 881f6e395cdc9..d9e1aabaaba2c 100644 --- a/packages/blocks/src/api/raw-handling/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -6,30 +6,24 @@ import { isPhrasingContent } from './phrasing-content'; function getSibling( node, which ) { const sibling = node[ `${ which }Sibling` ]; + if ( ! sibling || ! isPhrasingContent( sibling ) ) { + return; + } + if ( sibling ) { return sibling; } const { parentNode } = node; - if ( ! parentNode ) { - return; - } - - if ( ! isPhrasingContent( parentNode ) ) { - return; - } - return getSibling( parentNode, which ); } -function isFormattingSpace( character ) { - return character === ' ' || character === '\t' || character === '\n'; -} - /** * Removes spacing that formats HTML. * + * @see https://www.w3.org/TR/css-text-3/#white-space-processing + * * @param {Node} node The node to be processed. * @return {void} */ @@ -44,7 +38,7 @@ export default function( node ) { } // First, replace any sequence of HTML formatting space with a single space. - let newData = node.data.replace( /[ \n\t]+/g, ' ' ); + let newData = node.data.replace( /[ \r\n\t]+/g, ' ' ); // Remove the leading space if the text element is at the start of a block, // is preceded by a line break element, or has a space in the previous @@ -62,16 +56,13 @@ export default function( node ) { } // Remove the trailing space if the text element is at the end of a block, - // is succeded by a line break element, or has a space in the next element. + // or is succeded by a line break element. if ( newData[ newData.length - 1 ] === ' ' ) { const nextSibling = getSibling( node, 'next' ); if ( ! nextSibling || - nextSibling.nodeName === 'BR' || - // Note that any next node data has not yet been replaced, so we - // have to check for any formatting space. - isFormattingSpace( nextSibling.textContent[ 0 ] ) + nextSibling.nodeName === 'BR' ) { newData = newData.slice( 0, -1 ); } diff --git a/packages/blocks/src/api/raw-handling/paste-handler.js b/packages/blocks/src/api/raw-handling/paste-handler.js index 2519f356657f7..4468eb87c51db 100644 --- a/packages/blocks/src/api/raw-handling/paste-handler.js +++ b/packages/blocks/src/api/raw-handling/paste-handler.js @@ -24,7 +24,7 @@ import shortcodeConverter from './shortcode-converter'; import markdownConverter from './markdown-converter'; import iframeRemover from './iframe-remover'; import googleDocsUIDRemover from './google-docs-uid-remover'; -import HTMLFormattingRemover from './html-formatting-remover'; +import htmlFormattingRemover from './html-formatting-remover'; import { getPhrasingContentSchema } from './phrasing-content'; import { deepFilterHTML, @@ -225,7 +225,7 @@ export function pasteHandler( { HTML = '', plainText = '', mode = 'AUTO', tagNam piece = deepFilterHTML( piece, filters, blockContentSchema ); piece = removeInvalidHTML( piece, schema ); - piece = deepFilterHTML( piece, [ HTMLFormattingRemover ], blockContentSchema ); + piece = deepFilterHTML( piece, [ htmlFormattingRemover ], blockContentSchema ); piece = normaliseBlocks( piece ); // Allows us to ask for this information when we get a report. diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js index 2bbb0917626fe..f3c794d6306ae 100644 --- a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -88,7 +88,7 @@ describe( 'HTMLFormattingRemover', () => { `; - const output = '
a b
'; + const output = '
a b
'; expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); } ); From 35795d2ae10c9c0df96c1b79e715c1ec5cc32ef8 Mon Sep 17 00:00:00 2001 From: Ella van Durpe Date: Mon, 25 Nov 2019 16:54:29 +0100 Subject: [PATCH 6/7] Add extra test --- .../raw-handling/html-formatting-remover.js | 30 ++++++++++++++----- .../test/html-formatting-remover.js | 6 ++++ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js index d9e1aabaaba2c..615feb80508a9 100644 --- a/packages/blocks/src/api/raw-handling/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -6,19 +6,28 @@ import { isPhrasingContent } from './phrasing-content'; function getSibling( node, which ) { const sibling = node[ `${ which }Sibling` ]; - if ( ! sibling || ! isPhrasingContent( sibling ) ) { - return; - } - - if ( sibling ) { + if ( sibling && isPhrasingContent( sibling ) ) { return sibling; } const { parentNode } = node; + if ( ! parentNode || ! isPhrasingContent( parentNode ) ) { + return; + } + return getSibling( parentNode, which ); } +function isFormattingSpace( character ) { + return ( + character === ' ' || + character === '\r' || + character === '\n' || + character === '\t' + ); +} + /** * Removes spacing that formats HTML. * @@ -42,7 +51,7 @@ export default function( node ) { // Remove the leading space if the text element is at the start of a block, // is preceded by a line break element, or has a space in the previous - // element. + // node. if ( newData[ 0 ] === ' ' ) { const previousSibling = getSibling( node, 'previous' ); @@ -56,13 +65,18 @@ export default function( node ) { } // Remove the trailing space if the text element is at the end of a block, - // or is succeded by a line break element. + // is succeded by a line break element, or has a space in the next text + // node. if ( newData[ newData.length - 1 ] === ' ' ) { const nextSibling = getSibling( node, 'next' ); if ( ! nextSibling || - nextSibling.nodeName === 'BR' + nextSibling.nodeName === 'BR' || + ( + nextSibling.nodeType === nextSibling.TEXT_NODE && + isFormattingSpace( nextSibling.textContent[ 0 ] ) + ) ) { newData = newData.slice( 0, -1 ); } diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js index f3c794d6306ae..5cb8c39252c5d 100644 --- a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -96,4 +96,10 @@ describe( 'HTMLFormattingRemover', () => { const input = `
 a\n b\n
`; expect( deepFilterHTML( input, [ filter ] ) ).toEqual( input ); } ); + + it( 'should not remove white space if next elemnt has none', () => { + const input = `
a b
`; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); } ); From 760a49213102d76459ef34e83880c3286bd3e017 Mon Sep 17 00:00:00 2001 From: Ella van Durpe Date: Mon, 25 Nov 2019 17:19:44 +0100 Subject: [PATCH 7/7] Add extra plain text test --- .../src/api/raw-handling/test/html-formatting-remover.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js index 5cb8c39252c5d..d5efd50f4db85 100644 --- a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -5,6 +5,11 @@ import filter from '../html-formatting-remover'; import { deepFilterHTML } from '../utils'; describe( 'HTMLFormattingRemover', () => { + it( 'should trim text node without parent', () => { + const input = 'a'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( input ); + } ); + it( 'should remove formatting space', () => { const input = `