-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve support for pasting from Google Docs and Microsoft Word
- Loading branch information
Showing
7 changed files
with
245 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import googleDocs from './normalizers/googleDocs'; | ||
import msWord from './normalizers/msWord'; | ||
|
||
// Source-specific normalizers, applied in this order.
const NORMALIZERS = [msWord, googleDocs];

/**
 * Runs every registered normalizer over a parsed external HTML document.
 * The document is modified in place; nothing happens when it has no root
 * element.
 */
const normalizeExternalHTML = (doc: Document) => {
  if (!doc.documentElement) {
    return;
  }
  for (const normalize of NORMALIZERS) {
    normalize(doc);
  }
};

export default normalizeExternalHTML;
35 changes: 35 additions & 0 deletions
35
packages/quill/src/modules/normalizeExternalHTML/normalizers/googleDocs.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Matches an inline style that sets `font-weight` to `normal`.
const normalWeightRegexp = /font-weight:\s*normal/;
// Tag names (uppercase, as reported by `Element.tagName` in HTML documents)
// that are treated as block elements when looking for stray line breaks.
const blockTagNames = ['P', 'OL', 'UL'];

/**
 * Tells whether `element` is one of the block elements in `blockTagNames`.
 *
 * @param element - Element to inspect; `null` when there is no such sibling.
 * @returns `true` for a listed block element, `false` otherwise (including
 *   for `null`), so the result is always a real boolean.
 */
const isBlockElement = (element: Element | null): boolean => {
  return element !== null && blockTagNames.includes(element.tagName);
};
|
||
/**
 * Removes every `br` whose previous and next element siblings are both block
 * elements. All candidates are collected before any of them is removed.
 */
const normalizeEmptyLines = (doc: Document) => {
  const lineBreaks = Array.from(doc.querySelectorAll('br'));
  const betweenBlocks = lineBreaks.filter((lineBreak) => {
    const { previousElementSibling, nextElementSibling } = lineBreak;
    return (
      isBlockElement(previousElementSibling) &&
      isBlockElement(nextElementSibling)
    );
  });

  for (const lineBreak of betweenBlocks) {
    lineBreak.parentNode?.removeChild(lineBreak);
  }
};
|
||
/**
 * Unwraps `b` elements whose inline style sets `font-weight: normal`: each
 * such element is replaced by its own child nodes.
 */
const normalizeFontWeight = (doc: Document) => {
  const styledBolds = Array.from(
    doc.querySelectorAll('b[style*="font-weight"]'),
  );
  const normalWeightBolds = styledBolds.filter((bold) => {
    const style = bold.getAttribute('style');
    return style !== null && normalWeightRegexp.test(style);
  });

  for (const bold of normalWeightBolds) {
    // Move the children into a fragment, then swap the fragment in for the tag.
    const unwrapped = doc.createDocumentFragment();
    unwrapped.append(...bold.childNodes);
    bold.parentNode?.replaceChild(unwrapped, bold);
  }
};
|
||
/**
 * Normalizes a document pasted from Google Docs. The document is left
 * untouched unless it contains an element whose id starts with
 * `docs-internal-guid-`.
 */
export default function normalize(doc: Document) {
  const marker = doc.querySelector('[id^="docs-internal-guid-"]');
  if (!marker) {
    return;
  }
  normalizeFontWeight(doc);
  normalizeEmptyLines(doc);
}
92 changes: 92 additions & 0 deletions
92
packages/quill/src/modules/normalizeExternalHTML/normalizers/msWord.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import _ from 'lodash'; | ||
|
||
// Matches an `mso-list` declaration containing "ignore" (case-insensitive), as used on list markers. | ||
const ignoreRegexp = /\bmso-list:[^;]*ignore/i; | ||
// Captures the numeric list id from an `mso-list` declaration such as `l0 level1`. | ||
const idRegexp = /\bmso-list:[^;]*\bl(\d+)/i; | ||
// Captures the indent level from an `mso-list` declaration such as `l0 level1`. | ||
const indentRegexp = /\bmso-list:[^;]*\blevel(\d+)/i; | ||
|
||
/**
 * Reads the Word list metadata from an element's inline style.
 *
 * @param element - Element whose `style` attribute may carry `mso-list`.
 * @param html - Document HTML, searched for the matching `@list` rule.
 * @returns `null` when no list id is found; otherwise the list id, the indent
 *   level (1 when none is declared), the list type and the element itself.
 *   The type is "bullet" only when the `@list l<id>:level<indent>` rule sets
 *   `mso-level-number-format` to `bullet`, and "ordered" in every other case.
 */
const parseListItem = (element: Element, html: string) => {
  const style = element.getAttribute('style') ?? '';

  const idMatch = idRegexp.exec(style);
  if (idMatch === null) {
    return null;
  }
  const id = Number(idMatch[1]);

  const indentMatch = indentRegexp.exec(style);
  const indent = indentMatch === null ? 1 : Number(indentMatch[1]);

  const typePattern = new RegExp(
    `@list l${id}:level${indent}\\s*\\{[^\\}]*mso-level-number-format:\\s*([\\w-]+)`,
    'i',
  );
  const numberFormat = typePattern.exec(html)?.[1];
  const type = numberFormat === 'bullet' ? 'bullet' : 'ordered';

  return { id, indent, type, element };
};
|
||
/**
 * Rewrites Word list paragraphs into `ul`/`li` markup, in place.
 *
 * List items are represented as `p` tags with styles like
 * `mso-list: l0 level1` where:
 * 1. "0" in "l0" means the list item id;
 * 2. "1" in "level1" means the indent level, starting from 1.
 *
 * Consecutive sibling items that share an id are merged into a single `ul`.
 * Every `li` gets a `data-list` attribute and, for indent levels above 1, a
 * `ql-indent-N` class where N is the level minus one.
 *
 * @param doc - Parsed document to normalize; it is mutated.
 */
const normalizeListItem = (doc: Document) => {
  const msoList = Array.from(doc.querySelectorAll('[style*=mso-list]'));
  const [ignored, others] = _.partition(msoList, (node) =>
    (node.getAttribute('style') || '').match(ignoreRegexp),
  );

  // Each list item contains a marker wrapped with "mso-list: Ignore".
  ignored.forEach((node) => node.parentNode?.removeChild(node));

  // The list style is not defined inline with the tag; instead, it's in the
  // style tag, so we need to pass the html as a string.
  const html = doc.documentElement.innerHTML;
  const listItems = others
    .map((element) => parseListItem(element, html))
    .filter(
      (parsed): parsed is NonNullable<typeof parsed> => parsed !== null,
    );

  while (listItems.length) {
    const childListItems: typeof listItems = [];

    let current = listItems.shift();
    // Group continuous items into the same group (aka "ul").
    while (current) {
      childListItems.push(current);
      const next = listItems[0];
      current =
        next !== undefined &&
        next.element === current.element.nextElementSibling &&
        // Different id means the next item doesn't belong to this group.
        next.id === current.id
          ? listItems.shift()
          : undefined;
    }

    // Create the replacement nodes from `doc` rather than the global
    // `document`, so they belong to the document that is being normalized.
    const ul = doc.createElement('ul');
    childListItems.forEach((listItem) => {
      const li = doc.createElement('li');
      li.setAttribute('data-list', listItem.type);
      if (listItem.indent > 1) {
        li.setAttribute('class', `ql-indent-${listItem.indent - 1}`);
      }
      li.innerHTML = listItem.element.innerHTML;
      ul.appendChild(li);
    });

    const [first, ...rest] = childListItems;
    if (first === undefined) {
      continue;
    }
    // Read the parent before the swap: the first element is detached afterwards.
    const { parentNode } = first.element;
    parentNode?.replaceChild(ul, first.element);
    rest.forEach(({ element }) => {
      parentNode?.removeChild(element);
    });
  }
};
|
||
/**
 * Normalizes a document pasted from Microsoft Word. The document is left
 * untouched unless its root element declares the Word `xmlns:w` namespace.
 */
export default function normalize(doc: Document) {
  const wordNamespace = doc.documentElement.getAttribute('xmlns:w');
  if (wordNamespace !== 'urn:schemas-microsoft-com:office:word') {
    return;
  }
  normalizeListItem(doc);
}
36 changes: 36 additions & 0 deletions
36
packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/googleDocs.spec.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import { describe, expect, test } from 'vitest'; | ||
import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/googleDocs'; | ||
|
||
// Spec for the Google Docs normalizer: `b` wrappers styled with | ||
// `font-weight: normal` are unwrapped, other `b` tags are kept. | ||
describe('Google Docs', () => { | ||
test('remove unnecessary b tags', () => { | ||
// The `docs-internal-guid-` id on the outer `b` is what makes the normalizer treat this as Google Docs content. | ||
const html = ` | ||
<b | ||
style="font-weight: normal;" | ||
id="docs-internal-guid-9f51ddb9-7fff-7da1-2cd6-e966f9297902" | ||
> | ||
<span>Item 1</span><b>Item 2</b> | ||
</b> | ||
<b | ||
style="font-weight: bold;" | ||
>Item 3</b> | ||
`; | ||
|
||
const doc = new DOMParser().parseFromString(html, 'text/html'); | ||
normalize(doc); | ||
// Expect the outer wrapper to be gone while the unstyled and bold `b` tags remain. | ||
expect(doc.body.children).toMatchInlineSnapshot(` | ||
HTMLCollection [ | ||
<span> | ||
Item 1 | ||
</span>, | ||
<b> | ||
Item 2 | ||
</b>, | ||
<b | ||
style="font-weight: bold;" | ||
> | ||
Item 3 | ||
</b>, | ||
] | ||
`); | ||
}); | ||
}); |
60 changes: 60 additions & 0 deletions
60
packages/quill/test/unit/modules/normalizeExternalHTML/normalizers/msWord.spec.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import { describe, expect, test } from 'vitest'; | ||
import normalize from '../../../../../src/modules/normalizeExternalHTML/normalizers/msWord'; | ||
|
||
// Spec for the Microsoft Word normalizer: `mso-list` paragraphs become | ||
// `ul`/`li` groups, split by list id and by non-list content in between. | ||
describe('Microsoft Word', () => { | ||
test('keep the list style', () => { | ||
// Only `l0:level3` is declared as bullet; items without a bullet rule are expected to come out as ordered. | ||
const html = ` | ||
<html xmlns:w="urn:schemas-microsoft-com:office:word"> | ||
<style> | ||
@list l0:level3 { mso-level-number-format:bullet; } | ||
@list l2:level1 { mso-level-number-format:alpha; } | ||
</style> | ||
<body> | ||
<p style="mso-list: l0 level1 lfo1"><span style="mso-list: Ignore;">1. </span>item 1</p> | ||
<p style="mso-list: l0 level3 lfo1">item 2</p> | ||
<p style="mso-list: l1 level4 lfo1">item 3 in another list</p> | ||
<p>Plain paragraph</p> | ||
<p style="mso-list: l2 level1 lfo1">the last item</p> | ||
</body> | ||
</html> | ||
`; | ||
|
||
const doc = new DOMParser().parseFromString(html, 'text/html'); | ||
normalize(doc); | ||
// Expect three lists: l0 (two items), l1 (one item), then l2 after the plain paragraph. | ||
expect(doc.body.children).toMatchInlineSnapshot(` | ||
HTMLCollection [ | ||
<ul> | ||
<li | ||
data-list="ordered" | ||
> | ||
item 1 | ||
</li> | ||
<li | ||
class="ql-indent-2" | ||
data-list="bullet" | ||
> | ||
item 2 | ||
</li> | ||
</ul>, | ||
<ul> | ||
<li | ||
class="ql-indent-3" | ||
data-list="ordered" | ||
> | ||
item 3 in another list | ||
</li> | ||
</ul>, | ||
<p> | ||
Plain paragraph | ||
</p>, | ||
<ul> | ||
<li | ||
data-list="ordered" | ||
> | ||
the last item | ||
</li> | ||
</ul>, | ||
] | ||
`); | ||
}); | ||
}); |