Skip to content

Commit

Permalink
Improve support for pasting from Google Docs and Microsoft Word
Browse files Browse the repository at this point in the history
  • Loading branch information
luin committed Feb 3, 2024
1 parent 8fc9bb7 commit a4bfe54
Show file tree
Hide file tree
Showing 7 changed files with 245 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

- **Clipboard** Convert newlines between inline elements to a space.
- **Clipboard** Avoid generating unsupported formats on paste.
- **Clipboard** Improve support for pasting from Google Docs and Microsoft Word.
- **Syntax** Support highlight.js v10 and v11.

# 2.0.0-beta.2
Expand Down
8 changes: 7 additions & 1 deletion packages/quill/src/modules/clipboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { DirectionAttribute, DirectionStyle } from '../formats/direction';
import { FontStyle } from '../formats/font';
import { SizeStyle } from '../formats/size';
import { deleteRange } from './keyboard';
import normalizeExternalHTML from './normalizeExternalHTML';

const debug = logger('quill:clipboard');

Expand Down Expand Up @@ -118,8 +119,13 @@ class Clipboard extends Module<ClipboardOptions> {
return delta;
}

convertHTML(html: string) {
/**
 * Pre-processes the parsed clipboard document before it is converted.
 *
 * Delegates to `normalizeExternalHTML`, which mutates `doc` in place. The
 * method is `protected`, so a subclass can override it to change or extend
 * the normalization step.
 */
protected normalizeHTML(doc: Document) {
normalizeExternalHTML(doc);
}

protected convertHTML(html: string) {
const doc = new DOMParser().parseFromString(html, 'text/html');
this.normalizeHTML(doc);
const container = doc.body;
const nodeMatches = new WeakMap();
const [elementMatchers, textMatchers] = this.prepareMatching(
Expand Down
14 changes: 14 additions & 0 deletions packages/quill/src/modules/normalizeExternalHTML/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import googleDocs from './normalizers/googleDocs';
import msWord from './normalizers/msWord';

// Source-specific normalizers, applied in this order to every document.
const NORMALIZERS = [msWord, googleDocs];

/**
 * Runs every registered normalizer against `doc`, mutating it in place.
 *
 * Each normalizer decides on its own whether the document comes from the
 * application it handles. Documents without a root element are left alone.
 */
const normalizeExternalHTML = (doc: Document) => {
  if (!doc.documentElement) {
    return;
  }
  for (const normalize of NORMALIZERS) {
    normalize(doc);
  }
};

export default normalizeExternalHTML;
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Matches an inline style that declares `font-weight: normal` (any amount of
// whitespace after the colon). Used to spot `b` tags that do not actually
// render bold.
const normalWeightRegexp = /font-weight:\s*normal/;
// Tag names (upper-case, as reported by `Element.tagName` for HTML) that are
// treated as block-level when deciding whether a `br` is redundant.
const blockTagNames = ['P', 'OL', 'UL'];

/**
 * Tells whether `element` is one of the tags listed in `blockTagNames`.
 *
 * @param element - The element to inspect; `null` means "no such sibling".
 * @returns `true` for a listed block tag, `false` otherwise. A missing element
 *   yields `false` rather than leaking `null`, so the result is a real boolean.
 */
const isBlockElement = (element: Element | null): boolean => {
  return element !== null && blockTagNames.includes(element.tagName);
};

/**
 * Removes every `br` whose previous and next element siblings are both block
 * elements (as judged by `isBlockElement`).
 *
 * All candidates are collected first and removed afterwards, so the sibling
 * checks always see the untouched document.
 */
const normalizeEmptyLines = (doc: Document) => {
  const sitsBetweenBlocks = (br: Element) =>
    isBlockElement(br.previousElementSibling) &&
    isBlockElement(br.nextElementSibling);

  const redundantBreaks = Array.from(doc.querySelectorAll('br')).filter(
    (br) => sitsBetweenBlocks(br),
  );

  for (const br of redundantBreaks) {
    br.parentNode?.removeChild(br);
  }
};

/**
 * Unwraps `b` elements whose inline style sets `font-weight: normal`.
 *
 * Such a tag is replaced by its own child nodes, so the content stays in
 * place while the misleading bold wrapper disappears. `b` tags with any other
 * font weight are left untouched.
 */
const normalizeFontWeight = (doc: Document) => {
  const styledBoldTags = Array.from(
    doc.querySelectorAll('b[style*="font-weight"]'),
  );

  for (const boldTag of styledBoldTags) {
    const inlineStyle = boldTag.getAttribute('style');
    if (!inlineStyle?.match(normalWeightRegexp)) {
      continue;
    }

    // Move the children into a fragment and swap it in for the wrapper.
    const contents = doc.createDocumentFragment();
    contents.append(...boldTag.childNodes);
    boldTag.parentNode?.replaceChild(contents, boldTag);
  }
};

/**
 * Google Docs normalizer.
 *
 * Acts only when the document contains an element whose id starts with
 * `docs-internal-guid-`; in that case it unwraps normal-weight `b` tags and
 * then removes `br` elements sitting between block elements.
 */
export default function normalize(doc: Document) {
  const guidMarker = doc.querySelector('[id^="docs-internal-guid-"]');
  if (guidMarker === null) {
    return;
  }
  normalizeFontWeight(doc);
  normalizeEmptyLines(doc);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import _ from 'lodash';

// Matches an `mso-list` declaration whose value contains "ignore"
// (case-insensitive), e.g. `mso-list: Ignore`.
const ignoreRegexp = /\bmso-list:[^;]*ignore/i;
// Captures the numeric list id from an `mso-list: l<id> ...` declaration.
const idRegexp = /\bmso-list:[^;]*\bl(\d+)/i;
// Captures the indent level from an `mso-list: ... level<n>` declaration.
const indentRegexp = /\bmso-list:[^;]*\blevel(\d+)/i;

/**
 * Extracts list metadata from an element styled with `mso-list`.
 *
 * @param element - Candidate list item; its inline `style` is inspected.
 * @param html - Markup of the whole document, searched for the matching
 *   `@list l<id>:level<indent> { ... }` rule that declares the number format.
 * @returns `null` when the style carries no list id; otherwise the list id,
 *   the indent level (1 when none is declared), the list type ("bullet" only
 *   when the rule's `mso-level-number-format` is exactly `bullet`, "ordered"
 *   in every other case, including a missing rule) and the element itself.
 */
const parseListItem = (element: Element, html: string) => {
  const inlineStyle = element.getAttribute('style') ?? '';

  const idCapture = idRegexp.exec(inlineStyle);
  if (idCapture === null) {
    return null;
  }
  const id = Number(idCapture[1]);

  const levelCapture = indentRegexp.exec(inlineStyle);
  const indent = levelCapture === null ? 1 : Number(levelCapture[1]);

  // The number format lives in the document's style sheet, keyed by id+level.
  const formatRule = new RegExp(
    `@list l${id}:level${indent}\\s*\\{[^\\}]*mso-level-number-format:\\s*([\\w-]+)`,
    'i',
  );
  const numberFormat = formatRule.exec(html)?.[1];
  const type = numberFormat === 'bullet' ? 'bullet' : 'ordered';

  return { id, indent, type, element };
};

/**
 * Rebuilds Microsoft Word list paragraphs as `ul > li` markup.
 *
 * List items are represented as `p` tags with styles like
 * `mso-list: l0 level1` where:
 * 1. "0" in "l0" means the list item id;
 * 2. "1" in "level1" means the indent level, starting from 1.
 *
 * Consecutive sibling items that share the same id are merged into a single
 * `ul`. Every `li` gets a `data-list` attribute ("bullet" or "ordered") and,
 * for levels above 1, a `ql-indent-<level - 1>` class. `doc` is mutated in
 * place.
 */
const normalizeListItem = (doc: Document) => {
  const msoList = Array.from(doc.querySelectorAll('[style*=mso-list]'));
  const [ignored, others] = _.partition(msoList, (node) =>
    (node.getAttribute('style') || '').match(ignoreRegexp),
  );

  // Each list item contains a marker wrapped with "mso-list: Ignore".
  ignored.forEach((node) => node.parentNode?.removeChild(node));

  // The list style is not defined inline with the tag, instead, it's in the
  // style tag so we need to pass the html as a string.
  const html = doc.documentElement.innerHTML;
  const listItems = others
    .map((element) => parseListItem(element, html))
    .filter((parsed): parsed is NonNullable<typeof parsed> => parsed !== null);

  while (listItems.length) {
    const childListItems: typeof listItems = [];

    let current = listItems.shift();
    // Group continuous items into the same group (aka "ul")
    while (current) {
      childListItems.push(current);
      const next = listItems[0];
      const continuesGroup =
        next !== undefined &&
        next.element === current.element.nextElementSibling &&
        // Different id means the next item doesn't belong to this group.
        next.id === current.id;
      current = continuesGroup ? listItems.shift() : undefined;
    }

    // Create the replacement nodes in `doc` itself rather than in the global
    // `document`: this works where no global document exists and keeps the
    // pasted markup, re-parsed through `innerHTML` below, inside the document
    // it was parsed into instead of the live page.
    const ul = doc.createElement('ul');
    childListItems.forEach((listItem) => {
      const li = doc.createElement('li');
      li.setAttribute('data-list', listItem.type);
      if (listItem.indent > 1) {
        li.setAttribute('class', `ql-indent-${listItem.indent - 1}`);
      }
      li.innerHTML = listItem.element.innerHTML;
      ul.appendChild(li);
    });

    // The first item's slot receives the new list; the remaining items of the
    // group are siblings under the same parent and are simply dropped.
    const [first, ...rest] = childListItems;
    if (!first) {
      continue;
    }
    const { parentNode } = first.element;
    parentNode?.replaceChild(ul, first.element);
    rest.forEach(({ element }) => {
      parentNode?.removeChild(element);
    });
  }
};

/**
 * Microsoft Word normalizer.
 *
 * Acts only when the root element declares the Word namespace through its
 * `xmlns:w` attribute; in that case Word list paragraphs are rewritten as
 * `ul > li` markup.
 */
export default function normalize(doc: Document) {
  const wordNamespace = doc.documentElement.getAttribute('xmlns:w');
  if (wordNamespace !== 'urn:schemas-microsoft-com:office:word') {
    return;
  }
  normalizeListItem(doc);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { describe, expect, test } from 'vitest';
import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/googleDocs';

// Exercises the Google Docs normalizer on a document built with DOMParser.
describe('Google Docs', () => {
// The outer `b` carries both `font-weight: normal` and the
// `docs-internal-guid-` id, so it is expected to be unwrapped. The nested
// plain `b` and the `font-weight: bold` one must survive.
test('remove unnecessary b tags', () => {
const html = `
<b
style="font-weight: normal;"
id="docs-internal-guid-9f51ddb9-7fff-7da1-2cd6-e966f9297902"
>
<span>Item 1</span><b>Item 2</b>
</b>
<b
style="font-weight: bold;"
>Item 3</b>
`;

const doc = new DOMParser().parseFromString(html, 'text/html');
// The normalizer mutates `doc` in place; assert on the resulting body.
normalize(doc);
expect(doc.body.children).toMatchInlineSnapshot(`
HTMLCollection [
<span>
Item 1
</span>,
<b>
Item 2
</b>,
<b
style="font-weight: bold;"
>
Item 3
</b>,
]
`);
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import { describe, expect, test } from 'vitest';
import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/msWord';

// Exercises the Microsoft Word normalizer on a document built with DOMParser.
describe('Microsoft Word', () => {
// Covers, in one fixture:
// - removal of the "mso-list: Ignore" marker span;
// - "bullet" only where a matching @list rule says so (l0:level3), with
//   "ordered" as the fallback (no rule, or a non-bullet format such as alpha);
// - `ql-indent-<level - 1>` classes for levels above 1;
// - a new `ul` whenever the list id changes or a non-list sibling intervenes.
test('keep the list style', () => {
const html = `
<html xmlns:w="urn:schemas-microsoft-com:office:word">
<style>
@list l0:level3 { mso-level-number-format:bullet; }
@list l2:level1 { mso-level-number-format:alpha; }
</style>
<body>
<p style="mso-list: l0 level1 lfo1"><span style="mso-list: Ignore;">1. </span>item 1</p>
<p style="mso-list: l0 level3 lfo1">item 2</p>
<p style="mso-list: l1 level4 lfo1">item 3 in another list</p>
<p>Plain paragraph</p>
<p style="mso-list: l2 level1 lfo1">the last item</p>
</body>
</html>
`;

const doc = new DOMParser().parseFromString(html, 'text/html');
// The normalizer mutates `doc` in place; assert on the resulting body.
normalize(doc);
expect(doc.body.children).toMatchInlineSnapshot(`
HTMLCollection [
<ul>
<li
data-list="ordered"
>
item 1
</li>
<li
class="ql-indent-2"
data-list="bullet"
>
item 2
</li>
</ul>,
<ul>
<li
class="ql-indent-3"
data-list="ordered"
>
item 3 in another list
</li>
</ul>,
<p>
Plain paragraph
</p>,
<ul>
<li
data-list="ordered"
>
the last item
</li>
</ul>,
]
`);
});
});

0 comments on commit a4bfe54

Please sign in to comment.