Improve support for pasting from Google Docs and Microsoft Word

slab · Feb 3, 2024 · a4bfe54 · a4bfe54
1 parent 8fc9bb7
commit a4bfe54
Show file tree

Hide file tree

Showing 7 changed files with 245 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 - **Clipboard** Convert newlines between inline elements to a space.
 - **Clipboard** Avoid generating unsupported formats on paste.
+- **Clipboard** Improve support for pasting from Google Docs and Microsoft Word.
 - **Syntax** Support highlight.js v10 and v11.
 
 # 2.0.0-beta.2

diff --git a/packages/quill/src/modules/clipboard.ts b/packages/quill/src/modules/clipboard.ts
@@ -22,6 +22,7 @@ import { DirectionAttribute, DirectionStyle } from '../formats/direction';
 import { FontStyle } from '../formats/font';
 import { SizeStyle } from '../formats/size';
 import { deleteRange } from './keyboard';
+import normalizeExternalHTML from './normalizeExternalHTML';
 
 const debug = logger('quill:clipboard');
 
@@ -118,8 +119,13 @@ class Clipboard extends Module<ClipboardOptions> {
     return delta;
   }
 
-  convertHTML(html: string) {
+  /**
+   * Hands the parsed clipboard document to `normalizeExternalHTML`.
+   * `convertHTML` calls this right after parsing the pasted HTML.
+   */
+  protected normalizeHTML(doc: Document) {
+    normalizeExternalHTML(doc);
+  }
+
+  protected convertHTML(html: string) {
     const doc = new DOMParser().parseFromString(html, 'text/html');
+    this.normalizeHTML(doc);
     const container = doc.body;
     const nodeMatches = new WeakMap();
     const [elementMatchers, textMatchers] = this.prepareMatching(

diff --git a/packages/quill/src/modules/normalizeExternalHTML/index.ts b/packages/quill/src/modules/normalizeExternalHTML/index.ts
@@ -0,0 +1,14 @@
+import googleDocs from './normalizers/googleDocs';
+import msWord from './normalizers/msWord';
+
+// Normalizers to run against a pasted document, applied in array order.
+const NORMALIZERS = [msWord, googleDocs];
+
+/**
+ * Passes `doc` to every entry of `NORMALIZERS`, one after another.
+ * A document without a root element is left untouched.
+ */
+const normalizeExternalHTML = (doc: Document) => {
+  if (!doc.documentElement) {
+    return;
+  }
+  for (const normalize of NORMALIZERS) {
+    normalize(doc);
+  }
+};
+
+export default normalizeExternalHTML;
diff --git a/packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts b/packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts
@@ -0,0 +1,35 @@
+// Matches an inline style that sets `font-weight` to `normal`.
+const normalWeightRegexp = /font-weight:\s*normal/;
+// Tag names compared against `Element.tagName` to decide whether a `<br>`
+// sits between two block elements.
+const blockTagNames = ['P', 'OL', 'UL'];
+
+/**
+ * Tells whether `element` has one of the tag names in `blockTagNames`.
+ *
+ * @param element - Element to test; `null` (a missing sibling) is accepted.
+ * @returns `true` only for an existing element with a listed tag name. The
+ *   result is always a real boolean, never `null`.
+ */
+const isBlockElement = (element: Element | null): boolean => {
+  return element !== null && blockTagNames.includes(element.tagName);
+};
+
+/**
+ * Removes every `<br>` whose previous and next element siblings both pass
+ * `isBlockElement`.
+ */
+const normalizeEmptyLines = (doc: Document) => {
+  const redundantBreaks: Element[] = [];
+  // Collect first and remove afterwards, so no check runs on a modified tree.
+  for (const br of Array.from(doc.querySelectorAll('br'))) {
+    const sitsBetweenBlocks =
+      isBlockElement(br.previousElementSibling) &&
+      isBlockElement(br.nextElementSibling);
+    if (sitsBetweenBlocks) {
+      redundantBreaks.push(br);
+    }
+  }
+  for (const br of redundantBreaks) {
+    br.parentNode?.removeChild(br);
+  }
+};
+
+/**
+ * Unwraps `<b>` elements whose inline style matches `normalWeightRegexp`:
+ * each such tag is replaced by its own child nodes.
+ */
+const normalizeFontWeight = (doc: Document) => {
+  const candidates = doc.querySelectorAll('b[style*="font-weight"]');
+  const wrappers = Array.from(candidates).filter((candidate) => {
+    const style = candidate.getAttribute('style');
+    return style !== null && normalWeightRegexp.test(style);
+  });
+
+  for (const wrapper of wrappers) {
+    // Move the children into a fragment, then swap the fragment in for the tag.
+    const children = doc.createDocumentFragment();
+    children.append(...wrapper.childNodes);
+    wrapper.parentNode?.replaceChild(children, wrapper);
+  }
+};
+
+/**
+ * Google Docs normalizer. Does nothing unless the document contains an
+ * element whose id starts with `docs-internal-guid-`; otherwise it fixes font
+ * weights first and then removes redundant line breaks.
+ */
+export default function normalize(doc: Document) {
+  const marker = doc.querySelector('[id^="docs-internal-guid-"]');
+  if (marker === null) {
+    return;
+  }
+  normalizeFontWeight(doc);
+  normalizeEmptyLines(doc);
+}
diff --git a/packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts b/packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts
@@ -0,0 +1,92 @@
+import _ from 'lodash';
+
+// Matches an `mso-list` declaration whose value contains "ignore"
+// (case-insensitive), e.g. `mso-list: Ignore`.
+const ignoreRegexp = /\bmso-list:[^;]*ignore/i;
+// Captures the digits of the `l<N>` token of an `mso-list` declaration.
+const idRegexp = /\bmso-list:[^;]*\bl(\d+)/i;
+// Captures the digits of the `level<N>` token of an `mso-list` declaration.
+const indentRegexp = /\bmso-list:[^;]*\blevel(\d+)/i;
+
+/**
+ * Reads list metadata from an element's inline `mso-list` style.
+ *
+ * @param element - Element whose `style` attribute is inspected.
+ * @param html - Markup searched for the `@list l<id>:level<indent>` rule that
+ *   declares the number format.
+ * @returns `null` when the style carries no list id; otherwise the list id,
+ *   the indent level (1 when none is given), the list type ("bullet" only
+ *   when the rule's number format is `bullet`, else "ordered") and the element.
+ */
+const parseListItem = (element: Element, html: string) => {
+  const style = element.getAttribute('style');
+
+  const idMatch = style?.match(idRegexp);
+  if (!idMatch) {
+    return null;
+  }
+  const id = Number(idMatch[1]);
+
+  const levelMatch = style?.match(indentRegexp);
+  const indent = levelMatch ? Number(levelMatch[1]) : 1;
+
+  // Look up the number format declared for this list id and level.
+  const formatPattern = `@list l${id}:level${indent}\\s*\\{[^\\}]*mso-level-number-format:\\s*([\\w-]+)`;
+  const formatMatch = html.match(new RegExp(formatPattern, 'i'));
+  const isBullet = formatMatch !== null && formatMatch[1] === 'bullet';
+  const type = isBullet ? 'bullet' : 'ordered';
+
+  return { id, indent, type, element };
+};
+
+/**
+ * Rewrites Word list paragraphs into `<ul>`/`<li>` markup, modifying `doc`
+ * in place.
+ *
+ * List items are represented as `p` tags with styles like
+ * `mso-list: l0 level1` where:
+ * 1. "0" in "l0" means the list item id;
+ * 2. "1" in "level1" means the indent level, starting from 1.
+ *
+ * Consecutive sibling items that share a list id are merged into one `<ul>`.
+ * Every `<li>` gets a `data-list` attribute holding the parsed list type and,
+ * when its indent level is above 1, a `ql-indent-<level - 1>` class.
+ *
+ * @param doc - Parsed clipboard document to modify.
+ */
+const normalizeListItem = (doc: Document) => {
+  const msoList = Array.from(doc.querySelectorAll('[style*=mso-list]'));
+  const [ignored, others] = _.partition(msoList, (node) =>
+    (node.getAttribute('style') || '').match(ignoreRegexp),
+  );
+
+  // Each list item contains a marker wrapped with "mso-list: Ignore".
+  ignored.forEach((node) => node.parentNode?.removeChild(node));
+
+  // The list style is not defined inline with the tag, instead, it's in the
+  // style tag so we need to pass the html as a string.
+  const html = doc.documentElement.innerHTML;
+  const listItems = others
+    .map((element) => parseListItem(element, html))
+    .filter((parsed) => parsed);
+
+  while (listItems.length) {
+    const childListItems = [];
+
+    let current = listItems.shift();
+    // Group continuous items into the same group (aka "ul")
+    while (current) {
+      childListItems.push(current);
+      current =
+        listItems.length &&
+        listItems[0]?.element === current.element.nextElementSibling &&
+        // Different id means the next item doesn't belong to this group.
+        listItems[0].id === current.id
+          ? listItems.shift()
+          : null;
+    }
+
+    // Create the new nodes from `doc`, the document being normalized, not
+    // from the page's global `document`. The pasted markup assigned to
+    // `innerHTML` below then stays inside the parsed document, no node has to
+    // be adopted across documents, and no global `document` is required.
+    const ul = doc.createElement('ul');
+    childListItems.forEach((listItem) => {
+      const li = doc.createElement('li');
+      li.setAttribute('data-list', listItem.type);
+      if (listItem.indent > 1) {
+        li.setAttribute('class', `ql-indent-${listItem.indent - 1}`);
+      }
+      li.innerHTML = listItem.element.innerHTML;
+      ul.appendChild(li);
+    });
+
+    // The first paragraph of the group is swapped for the new list; the
+    // remaining paragraphs of the group are dropped.
+    const element = childListItems[0]?.element;
+    const { parentNode } = element ?? {};
+    if (element) {
+      parentNode?.replaceChild(ul, element);
+    }
+    childListItems.slice(1).forEach(({ element: e }) => {
+      parentNode?.removeChild(e);
+    });
+  }
+};
+
+/**
+ * Microsoft Word normalizer. Only acts when the root element's `xmlns:w`
+ * attribute equals the Word namespace URN; it then rewrites the list items.
+ */
+export default function normalize(doc: Document) {
+  const wordNamespace = doc.documentElement.getAttribute('xmlns:w');
+  if (wordNamespace !== 'urn:schemas-microsoft-com:office:word') {
+    return;
+  }
+  normalizeListItem(doc);
+}
diff --git a/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts
@@ -0,0 +1,36 @@
+import { describe, expect, test } from 'vitest';
+import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/googleDocs';
+
+// Checks that the Google Docs normalizer unwraps the `<b>` styled with
+// `font-weight: normal` and keeps the other `<b>` tags.
+describe('Google Docs', () => {
+  test('remove unnecessary b tags', () => {
+    // The first `<b>` carries the `docs-internal-guid-` id that the normalizer
+    // looks for before doing any work.
+    const html = `
+      <b
+        style="font-weight: normal;"
+        id="docs-internal-guid-9f51ddb9-7fff-7da1-2cd6-e966f9297902"
+      >
+        <span>Item 1</span><b>Item 2</b>
+      </b>
+      <b
+        style="font-weight: bold;"
+      >Item 3</b>
+      `;
+
+    const doc = new DOMParser().parseFromString(html, 'text/html');
+    normalize(doc);
+    // The wrapper is gone and its children now sit directly in the body.
+    expect(doc.body.children).toMatchInlineSnapshot(`
+      HTMLCollection [
+        <span>
+          Item 1
+        </span>,
+        <b>
+          Item 2
+        </b>,
+        <b
+          style="font-weight: bold;"
+        >
+          Item 3
+        </b>,
+      ]
+    `);
+  });
+});
diff --git a/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts b/packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts
@@ -0,0 +1,60 @@
+import { describe, expect, test } from 'vitest';
+import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/msWord';
+
+// Checks that Word list paragraphs become `<ul>`/`<li>` groups carrying the
+// list type and indent level, while a plain paragraph is left alone.
+describe('Microsoft Word', () => {
+  test('keep the list style', () => {
+    // The `xmlns:w` attribute on `<html>` is what the normalizer checks before
+    // acting; the `@list` rules in `<style>` declare the number formats.
+    const html = `
+      <html xmlns:w="urn:schemas-microsoft-com:office:word">
+        <style>
+          @list l0:level3 { mso-level-number-format:bullet; }
+          @list l2:level1 { mso-level-number-format:alpha; }
+        </style>
+        <body>
+          <p style="mso-list: l0 level1 lfo1"><span style="mso-list: Ignore;">1. </span>item 1</p>
+          <p style="mso-list: l0 level3 lfo1">item 2</p>
+          <p style="mso-list: l1 level4 lfo1">item 3 in another list</p>
+          <p>Plain paragraph</p>
+          <p style="mso-list: l2 level1 lfo1">the last item</p>
+        </body>
+      </html>
+      `;
+
+    const doc = new DOMParser().parseFromString(html, 'text/html');
+    normalize(doc);
+    // Items 1 and 2 share list id 0 and are adjacent, so they end up in the
+    // same `<ul>`; the other list items each get their own.
+    expect(doc.body.children).toMatchInlineSnapshot(`
+      HTMLCollection [
+        <ul>
+          <li
+            data-list="ordered"
+          >
+            item 1
+          </li>
+          <li
+            class="ql-indent-2"
+            data-list="bullet"
+          >
+            item 2
+          </li>
+        </ul>,
+        <ul>
+          <li
+            class="ql-indent-3"
+            data-list="ordered"
+          >
+            item 3 in another list
+          </li>
+        </ul>,
+        <p>
+          Plain paragraph
+        </p>,
+        <ul>
+          <li
+            data-list="ordered"
+          >
+            the last item
+          </li>
+        </ul>,
+      ]
+    `);
+  });
+});