From 71f867bd2e5d5568703406353abd16d7532e8361 Mon Sep 17 00:00:00 2001 From: Zihua Li Date: Sat, 3 Feb 2024 11:55:40 +0800 Subject: [PATCH] Improve support for pasting from Google Docs and Microsoft Word --- CHANGELOG.md | 1 + packages/quill/src/modules/clipboard.ts | 8 +- .../modules/normalizeExternalHTML/index.ts | 14 +++ .../normalizers/googleDocs.ts | 35 +++++++ .../normalizers/msWord.ts | 92 +++++++++++++++++++ .../quill/test/unit/modules/clipboard.spec.ts | 13 +++ .../normalizers/googleDocs.spec.ts | 36 ++++++++ .../normalizers/msWord.spec.ts | 60 ++++++++++++ 8 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 packages/quill/src/modules/normalizeExternalHTML/index.ts create mode 100644 packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts create mode 100644 packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts create mode 100644 packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts create mode 100644 packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 44ed55646e..775506bd96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - **Clipboard** Convert newlines between inline elements to a space. - **Clipboard** Avoid generating unsupported formats on paste. +- **Clipboard** Improve support for pasting from Google Docs and Microsoft Word. - **Syntax** Support highlight.js v10 and v11. 
# 2.0.0-beta.2 diff --git a/packages/quill/src/modules/clipboard.ts b/packages/quill/src/modules/clipboard.ts index ad21b95f23..e4a63cdf69 100644 --- a/packages/quill/src/modules/clipboard.ts +++ b/packages/quill/src/modules/clipboard.ts @@ -22,6 +22,7 @@ import { DirectionAttribute, DirectionStyle } from '../formats/direction'; import { FontStyle } from '../formats/font'; import { SizeStyle } from '../formats/size'; import { deleteRange } from './keyboard'; +import normalizeExternalHTML from './normalizeExternalHTML'; const debug = logger('quill:clipboard'); @@ -118,8 +119,13 @@ class Clipboard extends Module { return delta; } - convertHTML(html: string) { + protected normalizeHTML(doc: Document) { + normalizeExternalHTML(doc); + } + + protected convertHTML(html: string) { const doc = new DOMParser().parseFromString(html, 'text/html'); + this.normalizeHTML(doc); const container = doc.body; const nodeMatches = new WeakMap(); const [elementMatchers, textMatchers] = this.prepareMatching( diff --git a/packages/quill/src/modules/normalizeExternalHTML/index.ts b/packages/quill/src/modules/normalizeExternalHTML/index.ts new file mode 100644 index 0000000000..87468ea961 --- /dev/null +++ b/packages/quill/src/modules/normalizeExternalHTML/index.ts @@ -0,0 +1,14 @@ +import googleDocs from './normalizers/googleDocs'; +import msWord from './normalizers/msWord'; + +const NORMALIZERS = [msWord, googleDocs]; + +const normalizeExternalHTML = (doc: Document) => { + if (doc.documentElement) { + NORMALIZERS.forEach((normalize) => { + normalize(doc); + }); + } +}; + +export default normalizeExternalHTML; diff --git a/packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts b/packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts new file mode 100644 index 0000000000..1ce127d563 --- /dev/null +++ b/packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts @@ -0,0 +1,35 @@ +const normalWeightRegexp = /font-weight:\s*normal/; +const 
blockTagNames = ['P', 'OL', 'UL']; + +const isBlockElement = (element: Element | null) => { + return element && blockTagNames.includes(element.tagName); +}; + +const normalizeEmptyLines = (doc: Document) => { + Array.from(doc.querySelectorAll('br')) + .filter( + (br) => + isBlockElement(br.previousElementSibling) && + isBlockElement(br.nextElementSibling), + ) + .forEach((br) => { + br.parentNode?.removeChild(br); + }); +}; + +const normalizeFontWeight = (doc: Document) => { + Array.from(doc.querySelectorAll('b[style*="font-weight"]')) + .filter((node) => node.getAttribute('style')?.match(normalWeightRegexp)) + .forEach((node) => { + const fragment = doc.createDocumentFragment(); + fragment.append(...node.childNodes); + node.parentNode?.replaceChild(fragment, node); + }); +}; + +export default function normalize(doc: Document) { + if (doc.querySelector('[id^="docs-internal-guid-"]')) { + normalizeFontWeight(doc); + normalizeEmptyLines(doc); + } +} diff --git a/packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts b/packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts new file mode 100644 index 0000000000..2495e931ff --- /dev/null +++ b/packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts @@ -0,0 +1,92 @@ +import _ from 'lodash'; + +const ignoreRegexp = /\bmso-list:[^;]*ignore/i; +const idRegexp = /\bmso-list:[^;]*\bl(\d+)/i; +const indentRegexp = /\bmso-list:[^;]*\blevel(\d+)/i; + +const parseListItem = (element: Element, html: string) => { + const style = element.getAttribute('style'); + const idMatch = style?.match(idRegexp); + if (!idMatch) { + return null; + } + const id = Number(idMatch[1]); + + const indentMatch = style?.match(indentRegexp); + const indent = indentMatch ? 
Number(indentMatch[1]) : 1; + + const typeRegexp = new RegExp( + `@list l${id}:level${indent}\\s*\\{[^\\}]*mso-level-number-format:\\s*([\\w-]+)`, + 'i', + ); + const typeMatch = html.match(typeRegexp); + const type = typeMatch && typeMatch[1] === 'bullet' ? 'bullet' : 'ordered'; + + return { id, indent, type, element }; +}; + +// list items are represented as `p` tags with styles like `mso-list: l0 level1` where: +// 1. "0" in "l0" means the list item id; +// 2. "1" in "level1" means the indent level, starting from 1. +const normalizeListItem = (doc: Document) => { + const msoList = Array.from(doc.querySelectorAll('[style*=mso-list]')); + const [ignored, others] = _.partition(msoList, (node) => + (node.getAttribute('style') || '').match(ignoreRegexp), + ); + + // Each list item contains a marker wrapped with "mso-list: Ignore". + ignored.forEach((node) => node.parentNode?.removeChild(node)); + + // The list style is not defined inline with the tag; instead, it's in the + // style tag so we need to pass the html as a string. + const html = doc.documentElement.innerHTML; + const listItems = others + .map((element) => parseListItem(element, html)) + .filter((parsed) => parsed); + + while (listItems.length) { + const childListItems = []; + + let current = listItems.shift(); + // Group consecutive items into the same group (aka "ul") + while (current) { + childListItems.push(current); + current = + listItems.length && + listItems[0]?.element === current.element.nextElementSibling && + // Different id means the next item doesn't belong to this group. + listItems[0].id === current.id + ? 
listItems.shift() + : null; + } + + const ul = document.createElement('ul'); + childListItems.forEach((listItem) => { + const li = document.createElement('li'); + li.setAttribute('data-list', listItem.type); + if (listItem.indent > 1) { + li.setAttribute('class', `ql-indent-${listItem.indent - 1}`); + } + li.innerHTML = listItem.element.innerHTML; + ul.appendChild(li); + }); + + const element = childListItems[0]?.element; + const { parentNode } = element ?? {}; + if (element) { + parentNode?.replaceChild(ul, element); + } + childListItems.slice(1).forEach(({ element: e }) => { + parentNode?.removeChild(e); + }); + } +}; + +export default function normalize(doc: Document) { + if ( + doc.documentElement.getAttribute('xmlns:w') === + 'urn:schemas-microsoft-com:office:word' + ) { + normalizeListItem(doc); + } +} diff --git a/packages/quill/test/unit/modules/clipboard.spec.ts b/packages/quill/test/unit/modules/clipboard.spec.ts index bc9de3e7f7..a39fac08dd 100644 --- a/packages/quill/test/unit/modules/clipboard.spec.ts +++ b/packages/quill/test/unit/modules/clipboard.spec.ts @@ -528,5 +528,18 @@ describe('Clipboard', () => { }); expect(delta).toEqual(new Delta().insert('')); }); + + test('Google Docs', () => { + const html = `

text


  1. i1

  2. i2

    1. i3

text


`; + const delta = createClipboard().convert({ html }); + expect(delta).toEqual( + new Delta() + .insert('text\n') + .insert('i1\ni2\n', { list: 'ordered' }) + .insert('i3\n', { list: 'ordered', indent: 1 }) + .insert('text', { bold: true }) + .insert('\n'), + ); + }); }); }); diff --git a/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts new file mode 100644 index 0000000000..5d4c1a6ae3 --- /dev/null +++ b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts @@ -0,0 +1,36 @@ +import { describe, expect, test } from 'vitest'; +import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/googleDocs'; + +describe('Google Docs', () => { + test('remove unnecessary b tags', () => { + const html = ` + + Item 1Item 2 + + Item 3 + `; + + const doc = new DOMParser().parseFromString(html, 'text/html'); + normalize(doc); + expect(doc.body.children).toMatchInlineSnapshot(` + HTMLCollection [ + + Item 1 + , + + Item 2 + , + + Item 3 + , + ] + `); + }); +}); diff --git a/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts new file mode 100644 index 0000000000..860a7ac699 --- /dev/null +++ b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts @@ -0,0 +1,60 @@ +import { describe, expect, test } from 'vitest'; +import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/msWord'; + +describe('Microsoft Word', () => { + test('keep the list style', () => { + const html = ` + + + +

1. item 1

+

item 2

+

item 3 in another list

+

Plain paragraph

+

the last item

+ + + `; + + const doc = new DOMParser().parseFromString(html, 'text/html'); + normalize(doc); + expect(doc.body.children).toMatchInlineSnapshot(` + HTMLCollection [ + , + , +

+ Plain paragraph +

, + , + ] + `); + }); +});