Skip to content

Commit

Permalink
Fix: correctly escape text that would otherwise be interpreted as
Browse files Browse the repository at this point in the history
     raw HTML and HTML blocks.

Closes mixmark-io#106
Closes mixmark-io#261
  • Loading branch information
bjones1 committed Aug 23, 2024
1 parent cc73387 commit f060c41
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 1 deletion.
62 changes: 61 additions & 1 deletion src/turndown.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,45 @@ import { extend, trimLeadingNewlines, trimTrailingNewlines } from './utilities'
import RootNode from './root-node'
import Node from './node'
var reduce = Array.prototype.reduce
// Regular-expression source fragments describing raw inline HTML.
// Taken from `commonmark.js/lib/common.js`.
//
// Each constant is a *string* (not a RegExp); they are composed bottom-up
// into `HTMLTAG`, which is compiled by the consumer.

// A tag name: an ASCII letter followed by letters, digits or hyphens.
var TAGNAME = '[A-Za-z][A-Za-z0-9-]*'

// An attribute name, and the three permitted forms of attribute value.
var ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'
var UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"
var SINGLEQUOTEDVALUE = "'[^']*'"
var DOUBLEQUOTEDVALUE = '"[^"]*"'
var ATTRIBUTEVALUE =
  '(?:' +
  [UNQUOTEDVALUE, SINGLEQUOTEDVALUE, DOUBLEQUOTEDVALUE].join('|') +
  ')'

// `= value`, with optional whitespace on either side of the equals sign.
var ATTRIBUTEVALUESPEC = '(?:\\s*=\\s*' + ATTRIBUTEVALUE + ')'

// One attribute: mandatory leading whitespace, a name, and an optional value.
var ATTRIBUTE = '(?:\\s+' + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + '?)'

// Opening (optionally self-closing) and closing tags.
var OPENTAG = '<' + TAGNAME + ATTRIBUTE + '*\\s*/?>'
var CLOSETAG = '</' + TAGNAME + '\\s*[>]'

// The remaining HTML-like constructs: comments, processing instructions,
// declarations and CDATA sections.
var HTMLCOMMENT = '<!-->|<!--->|<!--(?:[^-]+|-[^-]|--[^>])*-->'
var PROCESSINGINSTRUCTION = '[<][?][\\s\\S]*?[?][>]'
var DECLARATION = '<![A-Z]+[^>]*>'
var CDATA = '<!\\[CDATA\\[[\\s\\S]*?\\]\\]>'

// Any one of the constructs above, wrapped in a non-capturing group.
var HTMLTAG =
  '(?:' +
  [
    OPENTAG,
    CLOSETAG,
    // Note: Turndown removes comments, so this portion of the regex isn't
    // necessary, but doesn't cause problems.
    HTMLCOMMENT,
    PROCESSINGINSTRUCTION,
    DECLARATION,
    CDATA
  ].join('|') +
  ')'
// End of copied commonmark code.
var escapes = [
[/\\/g, '\\\\'],
[/\*/g, '\\*'],
Expand All @@ -17,7 +56,28 @@ var escapes = [
[/\]/g, '\\]'],
[/^>/g, '\\>'],
[/_/g, '\\_'],
[/^(\d+)\. /g, '$1\\. ']
[/^(\d+)\. /g, '$1\\. '],
// Per [section 6.6 of the CommonMark spec](https://spec.commonmark.org/0.30/#raw-html),
// Raw HTML, CommonMark recognizes and passes through HTML-like tags and
// their contents. Therefore, Turndown needs to escape text that would parse
// as an HTML-like tag. This regex recognizes these tags and escapes them by
// inserting a leading backslash.
[new RegExp(HTMLTAG, 'g'), '\\$&'],
// Likewise, [section 4.6 of the CommonMark spec](https://spec.commonmark.org/0.30/#html-blocks),
// HTML blocks, requires the same treatment.
//
// This regex was copied from `commonmark.js/lib/blocks.js`, the
// `reHtmlBlockOpen` variable. We only need regexps for patterns not matched
// by the previous pattern, so this doesn't need all expressions there.
//
// TODO: this is too aggressive; it should only recognize this pattern at
// the beginning of a line of CommonMark source; these will recognize the
// pattern at the beginning of any inline or block markup. The approach I
// tried was to put this in `commonmark-rules.js` for the `paragraph` and
// `heading` rules (the only block beginning-of-line rules). However, text
// outside a paragraph/heading doesn't get escaped in this case.
[/^<(?:script|pre|textarea|style)(?:\s|>|$)/i, '\\$&'],
[/^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)/i, '\\$&']
]

export default function TurndownService (options) {
Expand Down
31 changes: 31 additions & 0 deletions test/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,37 @@ <h2>This is a header.</h2>
<pre class="expected">` nasty code `</pre>
</div>

<div class="case" data-name="Correct escaping of inline raw HTML">
<div class="input">Test &lt;code&gt;tags&lt;/code&gt;, &lt;!-- comments --&gt;, &lt;?processing instructions?&gt;, &lt;!A declaration&gt;, and &lt;![CDATA[character data]]&gt;.</div>
<pre class="expected">Test \&lt;code&gt;tags\&lt;/code&gt;, \&lt;!-- comments --&gt;, \&lt;?processing instructions?&gt;, \&lt;!A declaration&gt;, and &lt;!\[CDATA\[character data\]\]&gt;.</pre>
</div>

<div class="case" data-name="Correct escaping of multi-line raw inline HTML">
<div class="input">Test &lt;code&gt;multi-line
tags&lt;/code&gt;, &lt;!-- multi-line
comments --&gt;, &lt;?multi-line
processing instructions?&gt;, &lt;!A multi-line
declaration&gt;, and &lt;![CDATA[multi-line
character data]]&gt;.</div>
<pre class="expected">Test \&lt;code&gt;multi-line tags\&lt;/code&gt;, \&lt;!-- multi-line comments --&gt;, \&lt;?multi-line processing instructions?&gt;, \&lt;!A multi-line declaration&gt;, and &lt;!\[CDATA\[multi-line character data\]\]&gt;.</pre>
</div>

<div class="case" data-name="Correct escaping of HTML blocks">
<div class="input"><p>&lt;pre</p> <p>&lt;script</p> <p>&lt;style</p> <p>&lt;textarea</p> <p>&lt;address</p> <p>&lt;ul</p></div>
<pre class="expected">\&lt;pre

\&lt;script

\&lt;style

\&lt;textarea

\&lt;address

\&lt;ul</pre>
</div>


<!-- /TEST CASES -->

<script src="turndown-test.browser.js"></script>
Expand Down

0 comments on commit f060c41

Please sign in to comment.