diff --git a/src/constants/regular-expressions.ts b/src/constants/regular-expressions.ts
index a896237e..f470ccba 100644
--- a/src/constants/regular-expressions.ts
+++ b/src/constants/regular-expressions.ts
@@ -3,4 +3,11 @@
*/
const REGEX_LINE_BREAKS = /(?:\r\n|\r|\n)/g
-export { REGEX_LINE_BREAKS }
+/**
+ * Matches a single US-ASCII punctuation character, or a code point in U+2000-U+206F or U+2E00-U+2E7F.
+ *
+ * @see https://stackoverflow.com/a/25575009
+ */
+const REGEX_PUNCTUATION = /[\u2000-\u206f\u2e00-\u2e7f'!"#$%&()*+,\-./:;<=>?@\\[\]^_`{|}~]/
+
+export { REGEX_LINE_BREAKS, REGEX_PUNCTUATION }
diff --git a/src/serializers/markdown/markdown.test.ts b/src/serializers/markdown/markdown.test.ts
index c18de1d3..1c3f9f40 100644
--- a/src/serializers/markdown/markdown.test.ts
+++ b/src/serializers/markdown/markdown.test.ts
@@ -11,6 +11,13 @@ import { createMarkdownSerializer } from './markdown'
import type { MarkdownSerializerReturnType } from './markdown'
+const HTML_INPUT_SPECIAL_HTML_CHARS = `Ambition & Balance
+<doist>
+</doist>
+<doist></doist>
+"Doist"
+'Doist'`
+
const HTML_INPUT_HEADINGS = `
My favorite search engine is EFF.
This is the Markdown Guide.
See the section on code
.
\\' text \\'
+\\! text \\!
+\\" text \\"
+\\# text \\#
+\\$ text \\$
+\\% text \\%
+\\& text \\&
+\\( text \\(
+\\) text \\)
+\\* text \\*
+\\+ text \\+
+\\, text \\,
+\\\\ text \\\\
+\\- text \\-
+\\. text \\.
+\\/ text \\/
+\\: text \\:
+\\; text \\;
+\\< text \\<
+\\= text \\=
+\\> text \\>
+\\? text \\?
+\\@ text \\@
+\\[ text \\[
+\\] text \\]
+\\^ text \\^
+\\_ text \\_
+\\\` text \\\`
+\\{ text \\{
+\\| text \\|
+\\} text \\}
+\\~ text \\~
` +describe('Markdown Serializer', () => { describe('Plain-text Document', () => { describe('with default extensions', () => { let markdownSerializer: MarkdownSerializerReturnType @@ -227,24 +235,6 @@ before _ after- 1968. A great year!
+- I think 1969 was second best.
`), + ).toBe(`- 1968. A great year! +- I think 1969 was second best.`) + expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS)) + .toBe(`\\' text \\' +\\! text \\! +\\" text \\" +\\# text \\# +\\$ text \\$ +\\% text \\% +\\& text \\& +\\( text \\( +\\) text \\) +\\* text \\* +\\+ text \\+ +\\, text \\, +\\\\ text \\\\ +\\- text \\- +\\. text \\. +\\/ text \\/ +\\: text \\: +\\; text \\; +\\< text \\< +\\= text \\= +\\> text \\> +\\? text \\? +\\@ text \\@ +\\[ text \\[ +\\] text \\] +\\^ text \\^ +\\_ text \\_ +\\\` text \\\` +\\{ text \\{ +\\| text \\| +\\} text \\} +\\~ text \\~`) + }) + }) }) describe('with custom `*Suggestion` extensions', () => { @@ -292,7 +325,7 @@ Answer: [Doist Frontend](channel://190200)`) }) describe('Rich-text Document', () => { - describe('without default extensions', () => { + describe('with default extensions', () => { let markdownSerializer: MarkdownSerializerReturnType beforeEach(() => { @@ -309,24 +342,6 @@ Answer: [Doist Frontend](channel://190200)`) 'Doist'`) }) - test('special Markdown characters are escaped', () => { - expect(markdownSerializer.serialize(HTML_INPUT_SPECIAL_MARKDOWN_CHARS)) - .toBe(`before \\\\ after -before \\* after -\\- after -\\+ after -\\= after -\\=== after -\\\` after -\\~~~ after -before \\[ after -before \\] after -\\> after -before \\_ after -1\\. after -99\\. after`) - }) - test('headings Markdown output is correct', () => { expect(markdownSerializer.serialize(HTML_INPUT_HEADINGS)).toBe( '# Heading level 1\n\n## Heading level 2\n\n### Heading level 3\n\n#### Heading level 4\n\n##### Heading level 5\n\n###### Heading level 6', @@ -427,11 +442,6 @@ Strikethrough uses two tildes: ~~scratch this~~`, --- -- 1968\\. A great year! -- I think 1969 was second best. - ---- - - This is the first list item. - Here's the second list item. I need to add another paragraph below the second list item. @@ -471,11 +481,6 @@ Strikethrough uses two tildes: ~~scratch this~~`, --- -- 1968\\. A great year! 
-- I think 1969 was second best. - ---- - - This is the first list item. - Here's the second list item. I need to add another paragraph below the second list item. @@ -554,15 +559,83 @@ See the section on [\`code\`](#code).`, ) }) - test('special Markdown characters are NOT escaped if `escape` is disabled', () => { - const customSerializer = createMarkdownSerializer(getSchema([RichTextKit]), { - escape: false, + describe('with overridden `escape` function', () => { + test('backslash characters preceding punctuation characters are escaped correctly', () => { + expect(markdownSerializer.serialize(HTML_INPUT_PONCTUATION_CHARACTERS)) + .toBe(`\\\\' text \\\\' + +\\\\! text \\\\! + +\\\\" text \\\\" + +\\\\# text \\\\# + +\\\\$ text \\\\$ + +\\\\% text \\\\% + +\\\\& text \\\\& + +\\\\( text \\\\( + +\\\\) text \\\\) + +\\\\* text \\\\* + +\\\\+ text \\\\+ + +\\\\, text \\\\, + +\\\\\\ text \\\\\\ + +\\\\- text \\\\- + +\\\\. text \\\\. + +\\\\/ text \\\\/ + +\\\\: text \\\\: + +\\\\; text \\\\; + +\\\\< text \\\\< + +\\\\= text \\\\= + +\\\\> text \\\\> + +\\\\? text \\\\? + +\\\\@ text \\\\@ + +\\\\[ text \\\\[ + +\\\\] text \\\\] + +\\\\^ text \\\\^ + +\\\\_ text \\\\_ + +\\\\\` text \\\\\` + +\\\\{ text \\\\{ + +\\\\| text \\\\| + +\\\\} text \\\\} + +\\\\~ text \\\\~`) + }) + + test('text content that matches the ordered list syntax is escaped correctly', () => { + expect( + markdownSerializer.serialize(`Wrapped markdown **still markdown**
`, - ), - ).toBe(`**Wrapped markdown** **still markdown**`) }) }) @@ -620,11 +693,6 @@ See the section on [\`code\`](#code).`, --- -- 1968\\. A great year! -- I think 1969 was second best. - ---- - - This is the first list item. - Here's the second list item. I need to add another paragraph below the second list item. @@ -686,11 +754,6 @@ See the section on [\`code\`](#code).`, --- -- [ ] 1968\\. A great year! -- [x] I think 1969 was second best. - ---- - - [ ] This is the first list item. - [ ] Here's the second list item. I need to add another paragraph below the second list item. diff --git a/src/serializers/markdown/markdown.ts b/src/serializers/markdown/markdown.ts index 94c06f59..e3a79c01 100644 --- a/src/serializers/markdown/markdown.ts +++ b/src/serializers/markdown/markdown.ts @@ -1,5 +1,6 @@ import Turndown from 'turndown' +import { REGEX_PUNCTUATION } from '../../constants/regular-expressions' import { isPlainTextDocument } from '../../helpers/schema' import { image } from './plugins/image' @@ -11,16 +12,6 @@ import { taskItem } from './plugins/task-item' import type { Schema } from 'prosemirror-model' -/** - * The options that the `createMarkdownSerializer` function accepts. - */ -type MarkdownSerializerOptions = { - /** - * Disables markdown escaping. - */ - escape?: false -} - /** * The return type for the `createMarkdownSerializer` function. */ @@ -97,23 +88,41 @@ const INITIAL_TURNDOWN_OPTIONS: Turndown.Options = { * * @returns A normalized object for the Markdown serializer. */ -function createMarkdownSerializer( - schema: Schema, - options?: MarkdownSerializerOptions, -): MarkdownSerializerReturnType { +function createMarkdownSerializer(schema: Schema): MarkdownSerializerReturnType { // Initialize Turndown with custom options const turndown = new Turndown(INITIAL_TURNDOWN_OPTIONS) - // Turndown was built to convert HTML into Markdown, expecting the input to be standards - // compliant HTML. 
As such, it collapses all whitespace by default, and there's - // currently no way to opt-out of this behavior. However, for plain-text editors, we - // need to preserve Markdown whitespace (otherwise we lose syntax like nested lists) by - // replacing all instances of the space character (but only if it's preceded by another - // space character) by the non-breaking space character, and after processing the input - // with Turndown, we restore the original space character. - if (isPlainTextDocument(schema) || options?.escape === false) { + // Turndown ensures Markdown characters are escaped (i.e. `\`) by default, so they are not + // interpreted as Markdown when the output is compiled back to HTML. However, for plain-text + // editors, we need to override the `escape` function to return the input as-is (effectively + // disabling the escaping behaviour), so that all characters are interpreted as Markdown. + if (isPlainTextDocument(schema)) { turndown.escape = (str) => str } + // As for rich-text editors, we need to override the built-in escaping behaviour with a custom + // implementation to suit our requirements. Please note that the `escape` function takes the + // text content of each HTML element, with the exception of code elements, so we can be sure + // that the escaping behaviour will only touch relevant Markdown characters. + else { + turndown.escape = (str) => { + return ( + str + // Escape all backslash characters that precede any punctuation characters, + // otherwise the backslash character itself will be interpreted as escaping the + // character that comes after it (which is not the intent). It's important that + // this escape rule is executed before all other escape rules, otherwise we + // could be double escaping some backslash characters. + .replace(new RegExp(`(\\\\${REGEX_PUNCTUATION.source})`, 'g'), '\\$1') + + // Although the CommonMark specification allows for bulleted or ordered lists + // inside other bulleted or ordered lists (i.e.
`- 1. - 1. Item`), the markup + // generated by Markdown compilers is not supported by Tiptap, and we need to + // make sure that text content that matches the ordered list syntax is + // correctly escaped in order to be interpreted as text. + .replace(/^(\d+)\.(\s.+|$)/, '$1\\.$2') + ) + } + } // Overwrite some built-in rules for handling of special behaviours // (see documentation for each extension for more details)