From 6b36158d0a85673f7cb23365b5169ca9ae50dd90 Mon Sep 17 00:00:00 2001 From: Matthew Lipski Date: Mon, 4 Dec 2023 16:38:16 +0100 Subject: [PATCH] Added tests for parsing complex docs from Notion and Google Docs --- .../paste/parse-google-docs-html.json | 476 ++++++++++++++++++ .../paste/parse-notion-html.json | 470 +++++++++++++++++ .../src/api/parsers/html/parseHTML.test.ts | 171 +++++++ 3 files changed, 1117 insertions(+) create mode 100644 packages/core/src/api/parsers/html/__snapshots__/paste/parse-google-docs-html.json create mode 100644 packages/core/src/api/parsers/html/__snapshots__/paste/parse-notion-html.json diff --git a/packages/core/src/api/parsers/html/__snapshots__/paste/parse-google-docs-html.json b/packages/core/src/api/parsers/html/__snapshots__/paste/parse-google-docs-html.json new file mode 100644 index 000000000..c45e54ef9 --- /dev/null +++ b/packages/core/src/api/parsers/html/__snapshots__/paste/parse-google-docs-html.json @@ -0,0 +1,476 @@ +[ + { + "id": "1", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 1 + }, + "content": [ + { + "type": "text", + "text": "Heading 1", + "styles": { + "bold": true + } + } + ], + "children": [] + }, + { + "id": "2", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 2 + }, + "content": [ + { + "type": "text", + "text": "Heading 2", + "styles": { + "bold": true + } + } + ], + "children": [] + }, + { + "id": "3", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 3 + }, + "content": [ + { + "type": "text", + "text": "Heading 3", + "styles": { + "bold": true + } + } + ], + "children": [] + }, + { + "id": "4", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "5", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "6", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph 3", + "styles": {} + } + ], + "children": [] + }, + { + "id": "7", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph With \nHard Break", + "styles": {} + } + ], + "children": [] + }, + { + "id": "8", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bold", + "styles": { + "bold": true + } + }, + { + "type": "text", + "text": " ", + "styles": {} + }, + { + "type": "text", + "text": "Italic", + "styles": { + "italic": true + } + }, + { + "type": "text", + "text": " Underline ", + "styles": {} + }, + { + "type": "text", + "text": "Strikethrough", + "styles": { + "strike": true + } + }, + { + "type": "text", + "text": " ", + "styles": {} + }, + { + "type": "text", + "text": "All", + "styles": { + "bold": true, + "italic": true, + "strike": true + } + } + ], + "children": [] + }, + { + "id": "9", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bullet List Item 1", + "styles": {} + } + ], + "children": [ + { + "id": "10", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Bullet List Item 1", + "styles": {} + } + ], + "children": [ + { + "id": "11", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Numbered List Item 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "12", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Numbered List Item 2", + "styles": {} + } + ], + "children": [] + } + ] + }, + { + "id": "13", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Bullet List Item 2", + "styles": {} + } + ], + "children": [] + } + ] + }, + { + "id": "14", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bullet List Item 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "15", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Numbered List Item 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "16", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Numbered List Item 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "17", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [], + "children": [] + }, + { + "id": "18", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "\n", + "styles": {} + } + ], + "children": [] + }, + { + "id": "19", + "type": "table", + "props": { + "textColor": "default", + "backgroundColor": "default" + }, + "content": { + "type": "tableContent", + "rows": [ + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 1", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 2", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 3", + "styles": {} + } + ] + ] + }, + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 4", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 5", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 6", + "styles": {} + } + ] + ] + }, + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 7", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 8", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 9", + "styles": {} + } + ] + ] + } + ] + }, + "children": [] + }, + { + "id": "20", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph", + "styles": {} + } + ], + "children": [] + }, + { + "id": "21", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "\n", + "styles": {} + } + ], + "children": [] + } +] \ No newline at end of file diff --git a/packages/core/src/api/parsers/html/__snapshots__/paste/parse-notion-html.json b/packages/core/src/api/parsers/html/__snapshots__/paste/parse-notion-html.json new file mode 100644 index 000000000..d79fe0964 --- /dev/null +++ b/packages/core/src/api/parsers/html/__snapshots__/paste/parse-notion-html.json @@ -0,0 +1,470 @@ +[ + { + "id": "1", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 1 + }, + "content": [ + { + "type": "text", + "text": "Heading 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "2", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 2 + }, + "content": [ + { + "type": "text", + "text": "Heading 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "3", + "type": "heading", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left", + "level": 3 + }, + "content": [ + { + "type": "text", + "text": "Heading 3", + "styles": {} + } + ], + "children": [] + }, + { + "id": "4", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "5", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Paragraph 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "6", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Paragraph 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "7", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph With Hard Break", + "styles": {} + } + ], + "children": [] + }, + { + "id": "8", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bold", + "styles": { + "bold": true + } + }, + { + "type": "text", + "text": " ", + "styles": {} + }, + { + "type": "text", + "text": "Italic", + "styles": { + "italic": true + } + }, + { + "type": "text", + "text": " Underline ", + "styles": {} + }, + { + "type": "text", + "text": "Strikethrough", + "styles": { + "strike": true + } + }, + { + "type": "text", + "text": " ", + "styles": {} + }, + { + "type": "text", + "text": "All", + "styles": { + "bold": true, + "italic": true, + "strike": true + } + } + ], + "children": [] + }, + { + "id": "9", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bullet List Item 1", + "styles": {} + } + ], + "children": [ + { + "id": "10", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Bullet List Item 1", + "styles": {} + } + ], + "children": [ + { + "id": "11", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Numbered List Item 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "12", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Numbered List Item 2", + "styles": {} + } + ], + "children": [] + } + ] + }, + { + "id": "13", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Nested Bullet List Item 2", + "styles": {} + } + ], + "children": [] + } + ] + }, + { + "id": "14", + "type": "bulletListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Bullet List Item 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "15", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Numbered List Item 1", + "styles": {} + } + ], + "children": [] + }, + { + "id": "16", + "type": "numberedListItem", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Numbered List Item 2", + "styles": {} + } + ], + "children": [] + }, + { + "id": "17", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Background Color Paragraph", + "styles": {} + } + ], + "children": [] + }, + { + "id": "18", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "!", + "styles": {} + }, + { + "type": "link", + "href": "https://www.pulsecarshalton.co.uk/wp-content/uploads/2016/08/jk-placeholder-image.jpg", + "content": [ + { + "type": "text", + "text": "https://www.pulsecarshalton.co.uk/wp-content/uploads/2016/08/jk-placeholder-image.jpg", + "styles": {} + } + ] + } + ], + "children": [] + }, + { + "id": "19", + "type": "table", + "props": { + "textColor": "default", + "backgroundColor": "default" + }, + "content": { + "type": "tableContent", + "rows": [ + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 1", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 2", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 3", + "styles": {} + } + ] + ] + }, + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 4", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 5", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 6", + "styles": {} + } + ] + ] + }, + { + "cells": [ + [ + { + "type": "text", + "text": "Cell 7", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 8", + "styles": {} + } + ], + [ + { + "type": "text", + "text": "Cell 9", + "styles": {} + } + ] + ] + } + ] + }, + "children": [] + }, + { + "id": "20", + "type": "paragraph", + "props": { + "textColor": "default", + "backgroundColor": "default", + "textAlignment": "left" + }, + "content": [ + { + "type": "text", + "text": "Paragraph", + "styles": {} + } + ], + "children": [] + } +] \ No newline at end of file diff --git a/packages/core/src/api/parsers/html/parseHTML.test.ts b/packages/core/src/api/parsers/html/parseHTML.test.ts index 456ba2a4c..69be9d69b 100644 --- a/packages/core/src/api/parsers/html/parseHTML.test.ts +++ b/packages/core/src/api/parsers/html/parseHTML.test.ts @@ -266,4 +266,175 @@ describe("Parse HTML", () => { await parseHTMLAndCompareSnapshots(html, "parse-div-with-inline-content"); }); + + it("Parse Notion HTML", async () => { + // A few notes on Notion output HTML: + // - Does not preserve text/background colors + // - Does not preserve non-list-item block nesting + // - Hard breaks are represented using white space, not `
` elements + // - Images are converted to links with a "!" at the start + // - Cells in first row of a table are converted to `th` elements, regardless + // of if the row is set as a header row + + const html = `

Heading 1

+

Heading 2

+

Heading 3

+

Paragraph 1

+

Nested Paragraph 1

+

Nested Paragraph 2

+

Paragraph +With Hard Break

+

Bold Italic Underline Strikethrough All

+ +
    +
  1. Numbered List Item 1
  2. +
  3. Numbered List Item 2
  4. +
+

Background Color Paragraph

+

!https://www.pulsecarshalton.co.uk/wp-content/uploads/2016/08/jk-placeholder-image.jpg

+ + + + + + + + + + + + + + + + + + + + +
Cell 1Cell 2Cell 3
Cell 4Cell 5Cell 6
Cell 7Cell 8Cell 9
+

Paragraph

+`; + + await parseHTMLAndCompareSnapshots(html, "parse-notion-html"); + }); + + // Currently breaking, seems related to parsing `
` elements + it.skip("Parse Google Docs HTML", async () => { + // A few notes on Google Docs output HTML: + // - All inline markup is represented as `` elements with inline + // styles (bold, italic, etc.) + // - The nested list structure is not valid, i.e. `