Skip to content

Properly parse unclosed tags in code spans #1072

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/change_log/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Python-Markdown Change Log

Under development: version 3.3.4 (a bug-fix release).

* Properly parse unclosed tags in code spans (#1066).
* Properly parse processing instructions in md_in_html (#1070).
* Properly parse code spans in md_in_html (#1069).

Expand Down
20 changes: 20 additions & 0 deletions markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,26 @@ def handle_empty_tag(self, data, is_block):
else:
self.handle_data(self.md.htmlStash.store(data))

def parse_pi(self, i):
    """Handle a `<?` found at index `i` of the text being parsed.

    The `<?` is only parsed as a processing instruction when the parser is
    at the start of a line, in a tail, or has a non-empty `mdstack`. In any
    other position it is passed to `handle_data` as plain text so that tags
    which follow it are not consumed (see #1066).

    Returns the index at which parsing should continue.
    """
    may_be_raw = self.at_line_start() or self.intail or self.mdstack
    if not may_be_raw:
        # Not the beginning of a raw block: emit the two characters as data
        # and resume immediately after them.
        self.handle_data('<?')
        return i + 2
    # HTMLExtractor defines the same override, but without the `mdstack`
    # check, so it is bypassed in favor of HTMLExtractor's own parent.
    return super(HTMLExtractor, self).parse_pi(i)

def parse_html_declaration(self, i):
    """Handle a `<!` found at index `i` of the text being parsed.

    The `<!` is only parsed as an HTML declaration when the parser is at the
    start of a line, in a tail, or has a non-empty `mdstack`. In any other
    position it is passed to `handle_data` as plain text so that tags which
    follow it are not consumed (see #1066).

    Returns the index at which parsing should continue.
    """
    may_be_raw = self.at_line_start() or self.intail or self.mdstack
    if not may_be_raw:
        # Not the beginning of a raw block: emit the two characters as data
        # and resume immediately after them.
        self.handle_data('<!')
        return i + 2
    # HTMLExtractor defines the same override, but without the `mdstack`
    # check, so it is bypassed in favor of HTMLExtractor's own parent.
    return super(HTMLExtractor, self).parse_html_declaration(i)


class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
Expand Down
32 changes: 32 additions & 0 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the `# ...` notes inside the literal are regex
# comments. The three notes marked "added backtick here" point at the character classes which
# now exclude the backtick.
# NOTE(review): presumably the rest of the pattern mirrors the standard library's pattern of the
# same name -- confirm against the Python versions being supported.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^`>\s]* # bare value <= added backtick here
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
Expand Down Expand Up @@ -230,6 +246,22 @@ def unknown_decl(self, data):
end = ']]>' if data.startswith('CDATA[') else ']>'
self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

def parse_pi(self, i):
    """Handle a `<?` found at index `i` of the text being parsed.

    The `<?` is only handed to the parent class's `parse_pi` when the parser
    is at the start of a line or in a tail. In any other position it is
    passed to `handle_data` as plain text so that tags which follow it are
    not consumed (see #1066).

    Returns the index at which parsing should continue.
    """
    may_be_raw = self.at_line_start() or self.intail
    if not may_be_raw:
        # Not the beginning of a raw block: emit the two characters as data
        # and resume immediately after them.
        self.handle_data('<?')
        return i + 2
    return super().parse_pi(i)

def parse_html_declaration(self, i):
    """Handle a `<!` found at index `i` of the text being parsed.

    The `<!` is only handed to the parent class's `parse_html_declaration`
    when the parser is at the start of a line or in a tail. In any other
    position it is passed to `handle_data` as plain text so that tags which
    follow it are not consumed (see #1066).

    Returns the index at which parsing should continue.
    """
    may_be_raw = self.at_line_start() or self.intail
    if not may_be_raw:
        # Not the beginning of a raw block: emit the two characters as data
        # and resume immediately after them.
        self.handle_data('<!')
        return i + 2
    return super().parse_html_declaration(i)

# The rest has been copied from base class in standard lib to address #1036.
# As __startag_text is private, all references to it must be in this subclass.
# The last few lines of parse_starttag are reversed so that handle_starttag
Expand Down
105 changes: 105 additions & 0 deletions tests/test_syntax/blocks/test_html_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,48 @@ def test_raw_missing_close_bracket(self):
'<p>&lt;foo</p>'
)

def test_raw_unclosed_tag_in_code_span(self):
    """An unclosed `<div` inside a code span renders as code, and the raw block after it is kept."""
    # The fixture text stays flush-left so the literals given to `self.dedent` are unchanged.
    source = self.dedent(
        """
`<div`.

<div>
hello
</div>
"""
    )
    expected = self.dedent(
        """
<p><code>&lt;div</code>.</p>
<div>
hello
</div>
"""
    )
    self.assertMarkdownRenders(source, expected)

def test_raw_unclosed_tag_in_code_span_space(self):
    """As the unpadded case, but with spaces around `<div` inside the code span."""
    # The fixture text stays flush-left so the literals given to `self.dedent` are unchanged.
    source = self.dedent(
        """
` <div `.

<div>
hello
</div>
"""
    )
    expected = self.dedent(
        """
<p><code>&lt;div</code>.</p>
<div>
hello
</div>
"""
    )
    self.assertMarkdownRenders(source, expected)

def test_raw_attributes(self):
self.assertMarkdownRenders(
'<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>',
Expand Down Expand Up @@ -1073,6 +1115,27 @@ def test_raw_processing_instruction_indented(self):
)
)

def test_raw_processing_instruction_code_span(self):
    """A `<?php` inside a code span renders as code, and the raw block after it is kept."""
    # The fixture text stays flush-left so the literals given to `self.dedent` are unchanged.
    source = self.dedent(
        """
`<?php`

<div>
foo
</div>
"""
    )
    expected = self.dedent(
        """
<p><code>&lt;?php</code></p>
<div>
foo
</div>
"""
    )
    self.assertMarkdownRenders(source, expected)

def test_raw_declaration_one_line(self):
self.assertMarkdownRenders(
'<!DOCTYPE html>',
Expand Down Expand Up @@ -1110,6 +1173,27 @@ def test_raw_multiline_declaration(self):
)
)

def test_raw_declaration_code_span(self):
    """A `<!` inside a code span renders as code, and the raw block after it is kept."""
    # The fixture text stays flush-left so the literals given to `self.dedent` are unchanged.
    source = self.dedent(
        """
`<!`

<div>
foo
</div>
"""
    )
    expected = self.dedent(
        """
<p><code>&lt;!</code></p>
<div>
foo
</div>
"""
    )
    self.assertMarkdownRenders(source, expected)

def test_raw_cdata_one_line(self):
self.assertMarkdownRenders(
'<![CDATA[ document.write(">"); ]]>',
Expand Down Expand Up @@ -1190,6 +1274,27 @@ def test_raw_cdata_indented(self):
)
)

def test_raw_cdata_code_span(self):
    """A `<![` inside a code span renders as code, and the raw block after it is kept."""
    # The fixture text stays flush-left so the literals given to `self.dedent` are unchanged.
    source = self.dedent(
        """
`<![`

<div>
foo
</div>
"""
    )
    expected = self.dedent(
        """
<p><code>&lt;![</code></p>
<div>
foo
</div>
"""
    )
    self.assertMarkdownRenders(source, expected)

def test_charref(self):
self.assertMarkdownRenders(
'&sect;',
Expand Down