From 585f2074cfb065cda6ffed15f2c5b28dd071395d Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 7 Mar 2019 15:46:46 -0500 Subject: [PATCH 01/67] Refactor HTML Parser This is experimental. More of the HTMLParser methods need to be fleshed out. So far the basic stuff works as long as there is no invalid HTML in the document. --- markdown/blockprocessors.py | 23 ++++ markdown/htmlparser.py | 86 ++++++++++++ markdown/preprocessors.py | 266 ------------------------------------ 3 files changed, 109 insertions(+), 266 deletions(-) create mode 100644 markdown/htmlparser.py diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index e81f83c9a..ce16dbff2 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree as etree from . import util from .blockparser import BlockParser +from .htmlparser import HTMLExtractor logger = logging.getLogger('MARKDOWN') @@ -45,6 +46,7 @@ def build_block_parser(md, **kwargs): parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100) parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90) parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80) + parser.blockprocessors.register(RawHtmlProcessor(parser), 'html', 75) parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70) parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60) parser.blockprocessors.register(HRProcessor(parser), 'hr', 50) @@ -271,6 +273,27 @@ def run(self, parent, blocks): blocks.insert(0, theRest) +class RawHtmlProcessor(BlockProcessor): + + TAG_RE = re.compile(r'^\<(?P[^<> ]+)[^<>]*>') + + def test(self, parent, block): + m = self.TAG_RE.match(block) + return m and self.parser.md.is_block_level(m.group('tag')) + + def run(self, parent, blocks): + parser = HTMLExtractor(md=self.parser.md) + while blocks: + parser.feed(blocks.pop(0) + '\n\n') + if not parser.inraw: + break + # Insert Markdown 
back into blocks with raw HTML extracted. + parts = ''.join(parser.cleandoc).split('\n\n') + parts.reverse() + for block in parts: + blocks.insert(0, block) + + class BlockQuoteProcessor(BlockProcessor): RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)') diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py new file mode 100644 index 000000000..677f2bab1 --- /dev/null +++ b/markdown/htmlparser.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2019 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + +try: + from HTMLParser import HTMLParser + PY2 = True +except ImportError: + from html.parser import HTMLParser + PY2 = False + + +class HTMLExtractor(HTMLParser): + """ + Extract raw HTML from text. + + The raw HTML is stored in the `htmlStash` of the Markdown instance passed + to `md` and the remaining text is stored in `cleandoc` as a list of strings. 
+ """ + + def __init__(self, md): + if PY2: + # In PY2 HTMLParser is an old style class :( + HTMLParser.__init__(self) + else: + super(HTMLExtractor, self).__init__() + self.md = md + self.inraw = False + self.stack = [] # When inraw==True, stack contains a list of tags + self._cache = [] + self.cleandoc = [] + + def handle_starttag(self, tag, attrs): + self.stack.append(tag) + + line, col = self.getpos() + if col < 4 and self.md.is_block_level(tag) and not self.inraw: + # Started a new raw block + self.inraw = True + + text = self.get_starttag_text() + if self.inraw: + self._cache.append(text) + else: + self.cleandoc.append(text) + + def handle_endtag(self, tag): + text = '<{0}/>'.format(tag) + if tag in self.stack: + while self.stack: + if self.stack.pop() == tag: + break + if self.inraw and len(self.stack) == 0: + # End of raw block + self.inraw = False + self._cache.append(text) + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + self._cache = [] + elif self.inraw: + self._cache.append(text) + else: + self.cleandoc.append(text) + + def handle_data(self, data): + if self.inraw: + self._cache.append(data) + else: + self.cleandoc.append(data) diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index f12a02a94..c646b5ca4 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -33,7 +33,6 @@ def build_preprocessors(md, **kwargs): """ Build the default set of preprocessors used by Markdown. 
""" preprocessors = util.Registry() preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30) - preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) preprocessors.register(ReferencePreprocessor(md), 'reference', 10) return preprocessors @@ -71,271 +70,6 @@ def run(self, lines): return source.split('\n') -class HtmlBlockPreprocessor(Preprocessor): - """Remove html blocks from the text and store them for later retrieval.""" - - right_tag_patterns = ["", "%s>"] - attrs_pattern = r""" - \s+(?P[^>"'/= ]+)=(?P['"])(?P.*?)(?P=q) # attr="value" - | # OR - \s+(?P[^>"'/= ]+)=(?P[^> ]+) # attr=value - | # OR - \s+(?P[^>"'/= ]+) # attr - """ - left_tag_pattern = r'^\<(?P[^> ]+)(?P(%s)*)\s*\/?\>?' % \ - attrs_pattern - attrs_re = re.compile(attrs_pattern, re.VERBOSE) - left_tag_re = re.compile(left_tag_pattern, re.VERBOSE) - markdown_in_raw = False - - def _get_left_tag(self, block): - m = self.left_tag_re.match(block) - if m: - tag = m.group('tag') - raw_attrs = m.group('attrs') - attrs = {} - if raw_attrs: - for ma in self.attrs_re.finditer(raw_attrs): - if ma.group('attr'): - if ma.group('value'): - attrs[ma.group('attr').strip()] = ma.group('value') - else: - attrs[ma.group('attr').strip()] = "" - elif ma.group('attr1'): - if ma.group('value1'): - attrs[ma.group('attr1').strip()] = ma.group( - 'value1' - ) - else: - attrs[ma.group('attr1').strip()] = "" - elif ma.group('attr2'): - attrs[ma.group('attr2').strip()] = "" - return tag, len(m.group(0)), attrs - else: - tag = block[1:].split(">", 1)[0].lower() - return tag, len(tag)+2, {} - - def _recursive_tagfind(self, ltag, rtag, start_index, block): - while 1: - i = block.find(rtag, start_index) - if i == -1: - return -1 - j = block.find(ltag, start_index) - # if no ltag, or rtag found before another ltag, return index - if (j > i or j == -1): - return i + len(rtag) - # another ltag found before rtag, use end of ltag as starting - # point and search again - j = block.find('>', j) - start_index 
= self._recursive_tagfind(ltag, rtag, j + 1, block) - if start_index == -1: - # HTML potentially malformed- ltag has no corresponding - # rtag - return -1 - - def _get_right_tag(self, left_tag, left_index, block): - for p in self.right_tag_patterns: - tag = p % left_tag - i = self._recursive_tagfind( - "<%s" % left_tag, tag, left_index, block - ) - if i > 2: - return tag.lstrip("<").rstrip(">"), i - return block.rstrip()[-left_index:-1].lower(), len(block) - - def _equal_tags(self, left_tag, right_tag): - if left_tag[0] in ['?', '@', '%']: # handle PHP, etc. - return True - if ("/" + left_tag) == right_tag: - return True - if (right_tag == "--" and left_tag == "--"): - return True - elif left_tag == right_tag[1:] and right_tag[0] == "/": - return True - else: - return False - - def _is_oneliner(self, tag): - return (tag in ['hr', 'hr/']) - - def _stringindex_to_listindex(self, stringindex, items): - """ - Same effect as concatenating the strings in items, - finding the character to which stringindex refers in that string, - and returning the index of the item in which that character resides. 
- """ - items.append('dummy') - i, count = 0, 0 - while count <= stringindex: - count += len(items[i]) - i += 1 - return i - 1 - - def _nested_markdown_in_html(self, items): - """Find and process html child elements of the given element block.""" - for i, item in enumerate(items): - if self.left_tag_re.match(item): - left_tag, left_index, attrs = \ - self._get_left_tag(''.join(items[i:])) - right_tag, data_index = self._get_right_tag( - left_tag, left_index, ''.join(items[i:])) - right_listindex = \ - self._stringindex_to_listindex(data_index, items[i:]) + i - if 'markdown' in attrs.keys(): - items[i] = items[i][left_index:] # remove opening tag - placeholder = self.md.htmlStash.store_tag( - left_tag, attrs, i + 1, right_listindex + 1) - items.insert(i, placeholder) - if len(items) - right_listindex <= 1: # last nest, no tail - right_listindex -= 1 - items[right_listindex] = items[right_listindex][ - :-len(right_tag) - 2] # remove closing tag - else: # raw html - if len(items) - right_listindex <= 1: # last element - right_listindex -= 1 - if right_listindex <= i: - right_listindex = i + 1 - placeholder = self.md.htmlStash.store('\n\n'.join( - items[i:right_listindex])) - del items[i:right_listindex] - items.insert(i, placeholder) - return items - - def run(self, lines): - text = "\n".join(lines) - new_blocks = [] - text = text.rsplit("\n\n") - items = [] - left_tag = '' - right_tag = '' - in_tag = False # flag - - while text: - block = text[0] - if block.startswith("\n"): - block = block[1:] - text = text[1:] - - if block.startswith("\n"): - block = block[1:] - - if not in_tag: - if block.startswith("<") and len(block.strip()) > 1: - - if block[1:4] == "!--": - # is a comment block - left_tag, left_index, attrs = "--", 2, {} - else: - left_tag, left_index, attrs = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, - left_index, - block) - # keep checking conditions below and maybe just append - - if data_index < len(block) and 
(self.md.is_block_level(left_tag) or left_tag == '--'): - text.insert(0, block[data_index:]) - block = block[:data_index] - - if not (self.md.is_block_level(left_tag) or block[1] in ["!", "?", "@", "%"]): - new_blocks.append(block) - continue - - if self._is_oneliner(left_tag): - new_blocks.append(block.strip()) - continue - - if block.rstrip().endswith(">") \ - and self._equal_tags(left_tag, right_tag): - if self.markdown_in_raw and 'markdown' in attrs.keys(): - block = block[left_index:-len(right_tag) - 2] - new_blocks.append(self.md.htmlStash. - store_tag(left_tag, attrs, 0, 2)) - new_blocks.extend([block]) - else: - new_blocks.append( - self.md.htmlStash.store(block.strip())) - continue - else: - # if is block level tag and is not complete - if (not self._equal_tags(left_tag, right_tag)) and \ - (self.md.is_block_level(left_tag) or left_tag == "--"): - items.append(block.strip()) - in_tag = True - else: - new_blocks.append( - self.md.htmlStash.store(block.strip()) - ) - continue - - else: - new_blocks.append(block) - - else: - items.append(block) - - # Need to evaluate all items so we can calculate relative to the left index. 
- right_tag, data_index = self._get_right_tag(left_tag, left_index, ''.join(items)) - # Adjust data_index: relative to items -> relative to last block - prev_block_length = 0 - for item in items[:-1]: - prev_block_length += len(item) - data_index -= prev_block_length - - if self._equal_tags(left_tag, right_tag): - # if find closing tag - - if data_index < len(block): - # we have more text after right_tag - items[-1] = block[:data_index] - text.insert(0, block[data_index:]) - - in_tag = False - if self.markdown_in_raw and 'markdown' in attrs.keys(): - items[0] = items[0][left_index:] - items[-1] = items[-1][:-len(right_tag) - 2] - if items[len(items) - 1]: # not a newline/empty string - right_index = len(items) + 3 - else: - right_index = len(items) + 2 - new_blocks.append(self.md.htmlStash.store_tag( - left_tag, attrs, 0, right_index)) - placeholderslen = len(self.md.htmlStash.tag_data) - new_blocks.extend( - self._nested_markdown_in_html(items)) - nests = len(self.md.htmlStash.tag_data) - \ - placeholderslen - self.md.htmlStash.tag_data[-1 - nests][ - 'right_index'] += nests - 2 - else: - new_blocks.append( - self.md.htmlStash.store('\n\n'.join(items))) - items = [] - - if items: - if self.markdown_in_raw and 'markdown' in attrs.keys(): - items[0] = items[0][left_index:] - items[-1] = items[-1][:-len(right_tag) - 2] - if items[len(items) - 1]: # not a newline/empty string - right_index = len(items) + 3 - else: - right_index = len(items) + 2 - new_blocks.append( - self.md.htmlStash.store_tag( - left_tag, attrs, 0, right_index)) - placeholderslen = len(self.md.htmlStash.tag_data) - new_blocks.extend(self._nested_markdown_in_html(items)) - nests = len(self.md.htmlStash.tag_data) - placeholderslen - self.md.htmlStash.tag_data[-1 - nests][ - 'right_index'] += nests - 2 - else: - new_blocks.append( - self.md.htmlStash.store('\n\n'.join(items))) - new_blocks.append('\n') - - new_text = "\n\n".join(new_blocks) - return new_text.split("\n") - - class 
ReferencePreprocessor(Preprocessor): """ Remove reference definitions from text and store for later use. """ From 77baade81ea9c880a9f0ba9ecd2f7aa9236c8c81 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 7 Mar 2019 16:05:38 -0500 Subject: [PATCH 02/67] fix silly error --- markdown/htmlparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 677f2bab1..8c675a619 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -63,7 +63,7 @@ def handle_starttag(self, tag, attrs): self.cleandoc.append(text) def handle_endtag(self, tag): - text = '<{0}/>'.format(tag) + text = ''.format(tag) if tag in self.stack: while self.stack: if self.stack.pop() == tag: From d4c8951994780e7ab13f48f59989bcba5fc70d0b Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 7 Mar 2019 20:16:18 -0500 Subject: [PATCH 03/67] Add some new tests --- tests/test_syntax/blocks/test_html_blocks.py | 310 +++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 tests/test_syntax/blocks/test_html_blocks.py diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py new file mode 100644 index 000000000..e25d76d81 --- /dev/null +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -0,0 +1,310 @@ +# -*- coding: utf-8 -*- +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 
0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + +import unittest +from markdown.test_tools import TestCase + + +class TestHTMLBlocks(TestCase): + + def test_raw_paragraph(self): + self.assertMarkdownRenders( + '

A raw paragraph.

', + '

A raw paragraph.

' + ) + + def test_raw_skip_inline_markdown(self): + self.assertMarkdownRenders( + '

A *raw* paragraph.

', + '

A *raw* paragraph.

' + ) + + def test_raw_indent_one_space(self): + self.assertMarkdownRenders( + '

A *raw* paragraph.

', + # TODO: reevaluate. This matches strict rules and reference + # implementation version 1.0.1 but not 1.0.2b8. + '

A raw paragraph.

' + ) + + def test_raw_indent_four_spaces(self): + self.assertMarkdownRenders( + '

code block

', + self.dedent( + """ +
<p>code block</p>
+                
+ """ + ) + ) + + def test_raw_span(self): + self.assertMarkdownRenders( + '*inline*', + '

inline

' + ) + + def test_code_span(self): + self.assertMarkdownRenders( + '`code span`', + '

<em>code span</em>

' + ) + + def test_multiline_raw(self): + self.assertMarkdownRenders( + self.dedent( + """ +

+ A raw paragraph + with multiple lines. +

+ """ + ), + self.dedent( + """ +

+ A raw paragraph + with multiple lines. +

+ """ + ) + ) + + def test_blank_lines_in_raw(self): + self.assertMarkdownRenders( + self.dedent( + """ +

+ + A raw paragraph... + + with many blank lines. + +

+ """ + ), + self.dedent( + """ +

+ + A raw paragraph... + + with many blank lines. + +

+ """ + ) + ) + + def test_raw_surrounded_by_Markdown(self): + self.assertMarkdownRenders( + self.dedent( + """ + Some *Markdown* text. + +

*Raw* HTML.

+ + More *Markdown* text. + """ + ), + self.dedent( + """ +

Some Markdown text.

+

*Raw* HTML.

+ +

More Markdown text.

+ """ + ) + ) + + def test_raw_without_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + Some *Markdown* text. +

*Raw* HTML.

+ More *Markdown* text. + """ + ), + # The raw gets treated as inline HTML. This follows the rules and this lib's + # previous behavior, but not the reference implementation. TODO: Reevaluate. + self.dedent( + """ +

Some Markdown text. +

Raw HTML.

+ More Markdown text.

+ """ + ) + # The reference implementation does this instead: + # self.dedent( + # """ + #

Some Markdown text.

+ #

*Raw* HTML.

+ #

More Markdown text.

+ # """ + # ) + ) + + def test_raw_with_markdown_blocks(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ Not a Markdown paragraph. + + * Not a list item. + * Another non-list item. + + Another non-Markdown paragraph. +
+ """ + ), + self.dedent( + """ +
+ Not a Markdown paragraph. + + * Not a list item. + * Another non-list item. + + Another non-Markdown paragraph. +
+ """ + ) + ) + + # TODO: This fails. Fix it. + def test_adjacent_raw_blocks(self): + self.assertMarkdownRenders( + self.dedent( + """ +

A raw paragraph.

+

A second raw paragraph.

+ """ + ), + self.dedent( + """ +

A raw paragraph.

+

A second raw paragraph.

+ """ + ) + ) + + def test_adjacent_raw_blocks_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ +

A raw paragraph.

+ +

A second raw paragraph.

+ """ + ), + self.dedent( + """ +

A raw paragraph.

+ +

A second raw paragraph.

+ """ + ) + ) + + def test_nested_raw_block(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

A raw paragraph.

+
+ """ + ), + self.dedent( + """ +
+

A raw paragraph.

+
+ """ + ) + ) + + def test_nested_indented_raw_block(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

A raw paragraph.

+
+ """ + ), + self.dedent( + """ +
+

A raw paragraph.

+
+ """ + ) + ) + + def test_nested_raw_blocks(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

A raw paragraph.

+

A second raw paragraph.

+
+ """ + ), + self.dedent( + """ +
+

A raw paragraph.

+

A second raw paragraph.

+
+ """ + ) + ) + + def test_nested_raw_blocks_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +

A raw paragraph.

+ +

A second raw paragraph.

+ +
+ """ + ), + self.dedent( + """ +
+ +

A raw paragraph.

+ +

A second raw paragraph.

+ +
+ """ + ) + ) From 356f5c3c53f7f68c24e2751922f35f137c50a07d Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 8 Mar 2019 16:28:07 -0500 Subject: [PATCH 04/67] More tests. --- markdown/blockprocessors.py | 1 + tests/test_syntax/blocks/test_html_blocks.py | 228 ++++++++++++++++++- 2 files changed, 228 insertions(+), 1 deletion(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index ce16dbff2..eea883adb 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -287,6 +287,7 @@ def run(self, parent, blocks): parser.feed(blocks.pop(0) + '\n\n') if not parser.inraw: break + parser.close() # Insert Markdown back into blocks with raw HTML extracted. parts = ''.join(parser.cleandoc).split('\n\n') parts.reverse() diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index e25d76d81..8c600a457 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -20,7 +20,6 @@ License: BSD (see LICENSE.md for details). """ -import unittest from markdown.test_tools import TestCase @@ -69,6 +68,30 @@ def test_code_span(self): '

<em>code span</em>

' ) + def test_raw_empty(self): + self.assertMarkdownRenders( + '

', + '

' + ) + + def test_raw_empty_space(self): + self.assertMarkdownRenders( + '

', + '

' + ) + + def test_raw_empty_newline(self): + self.assertMarkdownRenders( + '

\n

', + '

\n

' + ) + + def test_raw_empty_blank_line(self): + self.assertMarkdownRenders( + '

\n\n

', + '

\n\n

' + ) + def test_multiline_raw(self): self.assertMarkdownRenders( self.dedent( @@ -308,3 +331,206 @@ def test_nested_raw_blocks_with_blank_lines(self): """ ) ) + + def test_raw_nested_inline(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

+ *text* +

+
+ """ + ), + self.dedent( + """ +
+

+ *text* +

+
+ """ + ) + ) + + def test_raw_nested_inline_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +

+ + *text* + +

+ +
+ """ + ), + self.dedent( + """ +
+ +

+ + *text* + +

+ +
+ """ + ) + ) + + # TODO: fix this. Presumably as the parser finishes things up in the `handle_endtag` + # method and as there is no end tag here, that code never gets run. Is there a way + # to retrieve the unprocessed source text from HTMLParser? May have to read the source. + # The `test_raw_nested_p_no_end_tag` below works because of the closing ``. + def test_raw_p_no_end_tag(self): + self.assertMarkdownRenders( + '

*text*', + '

*text*' + ) + + # TODO: fix this. See comment on previous test method. + def test_raw_multiple_p_no_end_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ +

*text*' + +

more *text* + """ + ), + self.dedent( + """ +

*text*' + +

more *text* + """ + ) + ) + + # TODO: fix this. See comment on previous test method. + def test_raw_p_no_end_tag_followed_by_blank_line(self): + self.assertMarkdownRenders( + self.dedent( + """ +

*raw text*' + + Still part of *raw* text. + """ + ), + self.dedent( + """ +

*raw text*' + + Still part of *raw* text. + """ + ) + ) + + def test_raw_nested_p_no_end_tag(self): + self.assertMarkdownRenders( + '

*text*

', + '

*text*

' + ) + + def test_raw_open_bracket_only(self): + self.assertMarkdownRenders( + '<', + '

<

' + ) + + def test_raw_open_bracket_followed_by_space(self): + self.assertMarkdownRenders( + '< foo', + '

< foo

' + ) + + def test_raw_missing_close_bracket(self): + self.assertMarkdownRenders( + '<foo

' + ) + + def test_raw_attributes(self): + self.assertMarkdownRenders( + '

text

', + '

text

' + ) + + def test_raw_attributes_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

text

+
+ """ + ), + self.dedent( + """ +
+

text

+
+ """ + ) + ) + + def test_raw_comment_one_line(self): + self.assertMarkdownRenders( + '', + '' + ) + + # TODO: Confirm this is correct + def test_raw_comment_one_line_followed_by_text(self): + self.assertMarkdownRenders( + '*bar*', + '*bar*' + ) + + def test_raw_multiline_comment(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_comment_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + # TODO: processing instruction, declaration, CDATA... From ff0f8f2d9208abb45ae52a29c8d1ac75451bee94 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 11 Mar 2019 10:02:47 -0400 Subject: [PATCH 05/67] Round out tests of valid markup. There are some design decisions to make as noted in comments. --- tests/test_syntax/blocks/test_html_blocks.py | 234 ++++++++++++++++++- 1 file changed, 232 insertions(+), 2 deletions(-) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 8c600a457..8bcdfd27c 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -486,7 +486,18 @@ def test_raw_comment_one_line(self): '' ) - # TODO: Confirm this is correct + # TODO: Decide behavior here. Python-Markdown current outputs: + # + # + #

bar

+ # + # But the reference implementation outputs: + # + #

bar

+ # + # As the raw HTML is not alone on the line, the reference implementation + # considers it inline rather than block level. The behavior defined in + # the test below is from the CommonMark spec, which we don't follow. def test_raw_comment_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', @@ -533,4 +544,223 @@ def test_raw_comment_with_blank_lines(self): ) ) - # TODO: processing instruction, declaration, CDATA... + def test_raw_comment_indented(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_processing_instruction_one_line(self): + self.assertMarkdownRenders( + "';' ?>", + "';' ?>" + ) + + # This is inline as it is not on a line by itself. + def test_raw_processing_instruction_one_line_followed_by_text(self): + self.assertMarkdownRenders( + "';' ?>*bar*", + "

'; ' ?>bar

" + ) + + def test_raw_multiline_processing_instruction(self): + self.assertMarkdownRenders( + self.dedent( + """ + ';' + ?> + """ + ), + self.dedent( + """ + ';' + ?> + """ + ) + ) + + def test_raw_processing_instruction_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + ';' + + ?> + """ + ), + self.dedent( + """ + ';' + + ?> + """ + ) + ) + + def test_raw_processing_instruction_indented(self): + self.assertMarkdownRenders( + self.dedent( + """ + ';' + + ?> + """ + ), + self.dedent( + """ + ';' + + ?> + """ + ) + ) + + def test_raw_declaration_one_line(self): + self.assertMarkdownRenders( + '', + '' + ) + + # TODO: Decide correct behavior. This matches current behavior and Commonmark. + # The reference implementation considers this inline not block level: + # + #

bar

+ # + # But most implementations do this instead: + # + #

<!DOCTYPE html>bar

+ # + # Either makes sense, but the later seems more correct to me. + def test_raw_declaration_one_line_followed_by_text(self): + self.assertMarkdownRenders( + '*bar*', + '*bar*' + ) + + def test_raw_multiline_declaration(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_cdata_one_line(self): + self.assertMarkdownRenders( + '"); ]]>', + '"); ]]>' + ) + + # TODO: Decide correct behavior. This matches current behavior and Commonmark. + # The reference implementation considers this inline not block level: + # + #

"); ]]>bar

+ # + # But most implementations do this instead: + # + #

<[CDATA[ document.write(“>”); ]]>bar

+ # + # Either makes sense, but the later seems more correct to me. + def test_raw_cdata_one_line_followed_by_text(self): + self.assertMarkdownRenders( + '"); ]]>*bar*', + '"); ]]>*bar*' + ) + + def test_raw_multiline_cdata(self): + self.assertMarkdownRenders( + self.dedent( + """ + "); + ]]> + """ + ), + self.dedent( + """ + "); + ]]> + """ + ) + ) + + def test_raw_cdata_with_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + "); + + ]]> + """ + ), + self.dedent( + """ + "); + + ]]> + """ + ) + ) + + def test_raw_cdata_indented(self): + self.assertMarkdownRenders( + self.dedent( + """ + "); + + ]]> + """ + ), + self.dedent( + """ + "); + + ]]> + """ + ) + ) From 6efe8d5e31f6456b38fbbbcb1bb533a4178c1c8c Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 11 Mar 2019 10:45:03 -0400 Subject: [PATCH 06/67] Some cleanup and bugfixes --- markdown/htmlparser.py | 21 ++++++++++++-------- tests/test_syntax/blocks/test_html_blocks.py | 6 ------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 8c675a619..b1c6dfc7c 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -22,10 +22,8 @@ try: from HTMLParser import HTMLParser - PY2 = True except ImportError: from html.parser import HTMLParser - PY2 = False class HTMLExtractor(HTMLParser): @@ -36,17 +34,24 @@ class HTMLExtractor(HTMLParser): to `md` and the remaining text is stored in `cleandoc` as a list of strings. """ - def __init__(self, md): - if PY2: - # In PY2 HTMLParser is an old style class :( - HTMLParser.__init__(self) - else: - super(HTMLExtractor, self).__init__() + def __init__(self, md, *args, **kwargs): + # This calls self.reset + HTMLParser.__init__(self, *args, **kwargs) # TODO: Use super when we drop PY2 support self.md = md + + def reset(self): + """Reset this instance. 
Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags self._cache = [] self.cleandoc = [] + HTMLParser.reset(self) # TODO: Use super when we drop PY2 support + + def close(self): + """Handle any buffered data.""" + HTMLParser.close(self) # TODO: Use super when we drop PY2 support + if len(self._cache): + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) def handle_starttag(self, tag, attrs): self.stack.append(tag) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 8bcdfd27c..8cf049bc8 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -384,17 +384,12 @@ def test_raw_nested_inline_with_blank_lines(self): ) ) - # TODO: fix this. Presumably as the parser finishes things up in the `handle_endtag` - # method and as there is no end tag here, that code never gets run. Is there a way - # to retrieve the unprocessed source text from HTMLParser? May have to read the source. - # The `test_raw_nested_p_no_end_tag` below works because of the closing ``. def test_raw_p_no_end_tag(self): self.assertMarkdownRenders( '

*text*', '

*text*' ) - # TODO: fix this. See comment on previous test method. def test_raw_multiple_p_no_end_tag(self): self.assertMarkdownRenders( self.dedent( @@ -413,7 +408,6 @@ def test_raw_multiple_p_no_end_tag(self): ) ) - # TODO: fix this. See comment on previous test method. def test_raw_p_no_end_tag_followed_by_blank_line(self): self.assertMarkdownRenders( self.dedent( From e5f9ca49393d7011227f6c7209990be579b61696 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 11 Mar 2019 15:58:19 -0400 Subject: [PATCH 07/67] Some minor tweaks --- markdown/htmlparser.py | 2 ++ tests/test_syntax/blocks/test_html_blocks.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index b1c6dfc7c..a4adec4c9 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -50,8 +50,10 @@ def reset(self): def close(self): """Handle any buffered data.""" HTMLParser.close(self) # TODO: Use super when we drop PY2 support + # Handle any unclosed tags. if len(self._cache): self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + self._cache = [] def handle_starttag(self, tag, attrs): self.stack.append(tag) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 8cf049bc8..03f7afd36 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -216,6 +216,11 @@ def test_raw_with_markdown_blocks(self): ) # TODO: This fails. Fix it. + # The HTML parser is mostly doing the correct thing here. The problem is that the + # two placeholders are in one paragraph so the postprocessor is not properly + # swaping then out. It checks for `

{ placeholder }

` but we have + # `

{ placeholder2 }\n{ placeholder2 }

`. We would need to add a second + # newline or use some other method than the current HTML stash. def test_adjacent_raw_blocks(self): self.assertMarkdownRenders( self.dedent( From 95e8498845a80878d92170fef21a6a09bf9e7d42 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 11 Mar 2019 16:25:45 -0400 Subject: [PATCH 08/67] comments partially fixed. --- markdown/htmlparser.py | 10 ++++++++++ tests/test_syntax/blocks/test_html_blocks.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index a4adec4c9..3a3fd9ba0 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -91,3 +91,13 @@ def handle_data(self, data): self._cache.append(data) else: self.cleandoc.append(data) + + def handle_comment(self, data): + text = ''.format(data) + line, col = self.getpos() + if self.inraw: + # Append this to the existing raw block + self._cache.append(text) + else: + # Handle this as a standalone raw block + self.cleandoc.append(self.md.htmlStash.store(text)) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 03f7afd36..021ecc03c 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -565,6 +565,24 @@ def test_raw_comment_indented(self): ) ) + def test_raw_comment_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+ """ + ), + self.dedent( + """ +
+ +
+ """ + ) + ) + def test_raw_processing_instruction_one_line(self): self.assertMarkdownRenders( "';' ?>", From ea98546557ccfd62dca1c6c12c6594322b883225 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 13 Mar 2019 15:48:46 -0400 Subject: [PATCH 09/67] Support 0-3 spaces of indent for raw HTML blocks --- markdown/blockprocessors.py | 2 +- tests/test_syntax/blocks/test_html_blocks.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index eea883adb..257078c68 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -275,7 +275,7 @@ def run(self, parent, blocks): class RawHtmlProcessor(BlockProcessor): - TAG_RE = re.compile(r'^\<(?P[^<> ]+)[^<>]*>') + TAG_RE = re.compile(r'^[ ]{0,3}\<(?P[^<> ]+)[^<>]*>') def test(self, parent, block): m = self.TAG_RE.match(block) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 021ecc03c..2a5c0ad06 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -40,9 +40,19 @@ def test_raw_skip_inline_markdown(self): def test_raw_indent_one_space(self): self.assertMarkdownRenders( '

A *raw* paragraph.

', - # TODO: reevaluate. This matches strict rules and reference - # implementation version 1.0.1 but not 1.0.2b8. - '

A raw paragraph.

' + '

A *raw* paragraph.

' + ) + + def test_raw_indent_two_spaces(self): + self.assertMarkdownRenders( + '

A *raw* paragraph.

', + '

A *raw* paragraph.

' + ) + + def test_raw_indent_three_spaces(self): + self.assertMarkdownRenders( + '

A *raw* paragraph.

', + '

A *raw* paragraph.

' ) def test_raw_indent_four_spaces(self): From 23e41d39c7a44643ca9fd30c0e82eb67d8071ff3 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 13 Mar 2019 21:11:48 -0400 Subject: [PATCH 10/67] Remove need to wrap raw in blank lines --- markdown/blockprocessors.py | 5 ++-- markdown/htmlparser.py | 6 +++++ tests/test_syntax/blocks/test_html_blocks.py | 26 ++++++-------------- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index 257078c68..16413c7d1 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -275,10 +275,10 @@ def run(self, parent, blocks): class RawHtmlProcessor(BlockProcessor): - TAG_RE = re.compile(r'^[ ]{0,3}\<(?P[^<> ]+)[^<>]*>') + TAG_RE = re.compile(r'(^|\n)[ ]{0,3}\<(?P[^<> ]+)[^<>]*>') def test(self, parent, block): - m = self.TAG_RE.match(block) + m = self.TAG_RE.search(block) return m and self.parser.md.is_block_level(m.group('tag')) def run(self, parent, blocks): @@ -289,6 +289,7 @@ def run(self, parent, blocks): break parser.close() # Insert Markdown back into blocks with raw HTML extracted. + print parser.cleandoc parts = ''.join(parser.cleandoc).split('\n\n') parts.reverse() for block in parts: diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 3a3fd9ba0..92261e13f 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -20,6 +20,7 @@ License: BSD (see LICENSE.md for details). """ +from __future__ import unicode_literals try: from HTMLParser import HTMLParser except ImportError: @@ -62,6 +63,9 @@ def handle_starttag(self, tag, attrs): if col < 4 and self.md.is_block_level(tag) and not self.inraw: # Started a new raw block self.inraw = True + if len(self.cleandoc): + # Insert blank line between this and previous line. 
+ self.cleandoc.append('\n') text = self.get_starttag_text() if self.inraw: @@ -80,6 +84,8 @@ def handle_endtag(self, tag): self.inraw = False self._cache.append(text) self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + # Insert blank line between this and next line. TODO: make this conditional?? + self.cleandoc.append('\n') self._cache = [] elif self.inraw: self._cache.append(text) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 2a5c0ad06..8a2d2a85d 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -178,23 +178,15 @@ def test_raw_without_blank_lines(self): More *Markdown* text. """ ), - # The raw gets treated as inline HTML. This follows the rules and this lib's - # previous behavior, but not the reference implementation. TODO: Reevaluate. + # TODO: Work out a way to eliminate the extra blank line. self.dedent( """ -

Some Markdown text. -

Raw HTML.

- More Markdown text.

+

Some Markdown text.

+

*Raw* HTML.

+ +

More Markdown text.

""" ) - # The reference implementation does this instead: - # self.dedent( - # """ - #

Some Markdown text.

- #

*Raw* HTML.

- #

More Markdown text.

- # """ - # ) ) def test_raw_with_markdown_blocks(self): @@ -225,12 +217,6 @@ def test_raw_with_markdown_blocks(self): ) ) - # TODO: This fails. Fix it. - # The HTML parser is mostly doing the correct thing here. The problem is that the - # two placeholders are in one paragraph so the postprocessor is not properly - # swaping then out. It checks for `

{ placeholder }

` but we have - # `

{ placeholder2 }\n{ placeholder2 }

`. We would need to add a second - # newline or use some other method than the current HTML stash. def test_adjacent_raw_blocks(self): self.assertMarkdownRenders( self.dedent( @@ -239,9 +225,11 @@ def test_adjacent_raw_blocks(self):

A second raw paragraph.

""" ), + # TODO: Work out a way to eliminate the extra blank line. self.dedent( """

A raw paragraph.

+

A second raw paragraph.

""" ) From 46b3a1be4bbd18a67e9fc08e9b4b4a60340f516a Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 13 Mar 2019 22:39:23 -0400 Subject: [PATCH 11/67] More tests passing --- markdown/blockprocessors.py | 6 +-- markdown/htmlparser.py | 25 +++++++-- tests/test_syntax/blocks/test_html_blocks.py | 55 +++++++------------- 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index 16413c7d1..3dc63ef3a 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -275,11 +275,12 @@ def run(self, parent, blocks): class RawHtmlProcessor(BlockProcessor): - TAG_RE = re.compile(r'(^|\n)[ ]{0,3}\<(?P[^<> ]+)[^<>]*>') + TAG_RE = re.compile(r'(^|\n)[ ]{0,3}<([?!].*?|(?P[^<> ]+)[^<>]*)>', re.S | re.U) def test(self, parent, block): m = self.TAG_RE.search(block) - return m and self.parser.md.is_block_level(m.group('tag')) + # If m but no 'tag', then we have a comment, declaration, or processing instruction. + return m and (self.parser.md.is_block_level(m.group('tag')) or not m.group('tag')) def run(self, parent, blocks): parser = HTMLExtractor(md=self.parser.md) @@ -289,7 +290,6 @@ def run(self, parent, blocks): break parser.close() # Insert Markdown back into blocks with raw HTML extracted. - print parser.cleandoc parts = ''.join(parser.cleandoc).split('\n\n') parts.reverse() for block in parts: diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 92261e13f..a970cfd16 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -98,12 +98,27 @@ def handle_data(self, data): else: self.cleandoc.append(data) - def handle_comment(self, data): - text = ''.format(data) + def handle_empty_tag(self, data): + """ Handle empty tags (``). 
""" line, col = self.getpos() if self.inraw: # Append this to the existing raw block - self._cache.append(text) - else: + self._cache.append(data) + elif col < 4: # Handle this as a standalone raw block - self.cleandoc.append(self.md.htmlStash.store(text)) + self.cleandoc.append(self.md.htmlStash.store(data)) + else: + # Presumably part of a code block. + self.cleandoc.append(data) + + def handle_comment(self, data): + self.handle_empty_tag(''.format(data)) + + def handle_decl(self, data): + self.handle_empty_tag(''.format(data)) + + def handle_pi(self, data): + self.handle_empty_tag(''.format(data)) + + def handle_unknown_decl(self, data): + self.handle_empty_tag(''.format(data)) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 8a2d2a85d..aa4215205 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -483,22 +483,12 @@ def test_raw_comment_one_line(self): '' ) - # TODO: Decide behavior here. Python-Markdown current outputs: - # - # - #

bar

- # - # But the reference implementation outputs: - # - #

bar

- # - # As the raw HTML is not alone on the line, the reference implementation - # considers it inline rather than block level. The behavior defined in - # the test below is from the CommonMark spec, which we don't follow. + # Note: this is a change in behavior for Python_markdown but matches the reference implementation. + # Previous output was `\n

bar

`. Browsers render both the same. def test_raw_comment_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', - '*bar*' + '

bar

' ) def test_raw_multiline_comment(self): @@ -581,6 +571,17 @@ def test_raw_comment_nested(self): ) ) + def test_comment_in_code_block(self): + self.assertMarkdownRenders( + ' ', + self.dedent( + """ +
<!-- *foo* -->
+                
+ """ + ) + ) + def test_raw_processing_instruction_one_line(self): self.assertMarkdownRenders( "';' ?>", @@ -662,20 +663,12 @@ def test_raw_declaration_one_line(self): '' ) - # TODO: Decide correct behavior. This matches current behavior and Commonmark. - # The reference implementation considers this inline not block level: - # - #

bar

- # - # But most implementations do this instead: - # - #

<!DOCTYPE html>bar

- # - # Either makes sense, but the later seems more correct to me. + # Note: this is a change in behavior for Python_markdown but matches the reference implementation. + # Previous output was `*bar*`. def test_raw_declaration_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', - '*bar*' + '

bar

' ) def test_raw_multiline_declaration(self): @@ -702,20 +695,12 @@ def test_raw_cdata_one_line(self): '"); ]]>' ) - # TODO: Decide correct behavior. This matches current behavior and Commonmark. - # The reference implementation considers this inline not block level: - # - #

"); ]]>bar

- # - # But most implementations do this instead: - # - #

<[CDATA[ document.write(“>”); ]]>bar

- # - # Either makes sense, but the later seems more correct to me. + # Note: this is a change in behavior for Python_markdown but matches the reference implementation. + # Previous output was `"); ]]>*bar*`. def test_raw_cdata_one_line_followed_by_text(self): self.assertMarkdownRenders( '"); ]]>*bar*', - '"); ]]>*bar*' + '

"); ]]>bar

' ) def test_raw_multiline_cdata(self): From 8a17794575389d6c1395a659b701160ed84dee26 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 14 Mar 2019 20:13:16 -0400 Subject: [PATCH 12/67] All handle_* methods are now defined and tested --- markdown/htmlparser.py | 25 +++++++--- tests/test_syntax/blocks/test_html_blocks.py | 48 ++++++++++++++++++++ 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index a970cfd16..c05e82f49 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -21,6 +21,7 @@ """ from __future__ import unicode_literals +from . import util try: from HTMLParser import HTMLParser except ImportError: @@ -36,6 +37,8 @@ class HTMLExtractor(HTMLParser): """ def __init__(self, md, *args, **kwargs): + if util.PY3 and 'convert_charrefs' not in kwargs: + kwargs['convert_charrefs'] = False # This calls self.reset HTMLParser.__init__(self, *args, **kwargs) # TODO: Use super when we drop PY2 support self.md = md @@ -98,27 +101,35 @@ def handle_data(self, data): else: self.cleandoc.append(data) - def handle_empty_tag(self, data): + def handle_empty_tag(self, data, is_block): """ Handle empty tags (``). """ line, col = self.getpos() if self.inraw: # Append this to the existing raw block self._cache.append(data) - elif col < 4: + elif col < 4 and is_block: # Handle this as a standalone raw block self.cleandoc.append(self.md.htmlStash.store(data)) else: - # Presumably part of a code block. 
self.cleandoc.append(data) + def handle_startendtag(self, tag, attrs): + self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag)) + + def handle_charref(self, name): + self.handle_empty_tag('&#{};'.format(name), is_block=False) + + def handle_entityref(self, name): + self.handle_empty_tag('&{};'.format(name), is_block=False) + def handle_comment(self, data): - self.handle_empty_tag(''.format(data)) + self.handle_empty_tag(''.format(data), is_block=True) def handle_decl(self, data): - self.handle_empty_tag(''.format(data)) + self.handle_empty_tag(''.format(data), is_block=True) def handle_pi(self, data): - self.handle_empty_tag(''.format(data)) + self.handle_empty_tag(''.format(data), is_block=True) def handle_unknown_decl(self, data): - self.handle_empty_tag(''.format(data)) + self.handle_empty_tag(''.format(data), is_block=True) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index aa4215205..464ed7280 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -764,3 +764,51 @@ def test_raw_cdata_indented(self): """ ) ) + + def test_charref(self): + self.assertMarkdownRenders( + '§', + '

§

' + ) + + def test_nested_charref(self): + self.assertMarkdownRenders( + '

§

', + '

§

' + ) + + def test_entityref(self): + self.assertMarkdownRenders( + '§', + '

§

' + ) + + def test_nested_entityref(self): + self.assertMarkdownRenders( + '

§

', + '

§

' + ) + + def test_startendtag(self): + self.assertMarkdownRenders( + '', + '

' + ) + + def test_closed_startendtag(self): + self.assertMarkdownRenders( + '', + '

' + ) + + def test_nested_startendtag(self): + self.assertMarkdownRenders( + '
', + '
' + ) + + def test_nested_closed_startendtag(self): + self.assertMarkdownRenders( + '
', + '
' + ) From 845637a643cd5c8a5df3bac0b6f03c6794cb7423 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 15 Mar 2019 21:00:23 -0400 Subject: [PATCH 13/67] Some test cleanup --- tests/test_syntax/blocks/test_html_blocks.py | 36 ++++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 464ed7280..a480ca7df 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -584,15 +584,15 @@ def test_comment_in_code_block(self): def test_raw_processing_instruction_one_line(self): self.assertMarkdownRenders( - "';' ?>", - "';' ?>" + "'; ?>", + "'; ?>" ) # This is inline as it is not on a line by itself. def test_raw_processing_instruction_one_line_followed_by_text(self): self.assertMarkdownRenders( - "';' ?>*bar*", - "

'; ' ?>bar

" + "'; ?>*bar*", + "

'; ?>bar

" ) def test_raw_multiline_processing_instruction(self): @@ -600,14 +600,14 @@ def test_raw_multiline_processing_instruction(self): self.dedent( """ ';' + echo '>'; ?> """ ), self.dedent( """ ';' + echo '>'; ?> """ ) @@ -619,7 +619,7 @@ def test_raw_processing_instruction_with_blank_lines(self): """ ';' + echo '>'; ?> """ @@ -628,7 +628,7 @@ def test_raw_processing_instruction_with_blank_lines(self): """ ';' + echo '>'; ?> """ @@ -641,7 +641,7 @@ def test_raw_processing_instruction_indented(self): """ ';' + echo '>'; ?> """ @@ -650,7 +650,7 @@ def test_raw_processing_instruction_indented(self): """ ';' + echo '>'; ?> """ @@ -812,3 +812,19 @@ def test_nested_closed_startendtag(self): '
', '
' ) + + def test_auto_links_dont_break_parser(self): + self.assertMarkdownRenders( + self.dedent( + """ + + + + """ + ), + '

https://example.com

\n' + '

email@e' + 'xample.com

' + ) From eee4e4980ae32c5909cafeca171fc92edcf4179b Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sat, 16 Mar 2019 14:22:13 -0400 Subject: [PATCH 14/67] Monkeypatch HTMLParser piclose --- markdown/htmlparser.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index c05e82f49..562d60727 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -23,12 +23,16 @@ from __future__ import unicode_literals from . import util try: - from HTMLParser import HTMLParser + import HTMLParser as parser except ImportError: - from html.parser import HTMLParser + from html import parser +# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions. +import re +parser.piclose = re.compile(r'\?>') -class HTMLExtractor(HTMLParser): + +class HTMLExtractor(parser.HTMLParser): """ Extract raw HTML from text. @@ -40,7 +44,7 @@ def __init__(self, md, *args, **kwargs): if util.PY3 and 'convert_charrefs' not in kwargs: kwargs['convert_charrefs'] = False # This calls self.reset - HTMLParser.__init__(self, *args, **kwargs) # TODO: Use super when we drop PY2 support + parser.HTMLParser.__init__(self, *args, **kwargs) # TODO: Use super when we drop PY2 support self.md = md def reset(self): @@ -49,11 +53,11 @@ def reset(self): self.stack = [] # When inraw==True, stack contains a list of tags self._cache = [] self.cleandoc = [] - HTMLParser.reset(self) # TODO: Use super when we drop PY2 support + parser.HTMLParser.reset(self) # TODO: Use super when we drop PY2 support def close(self): """Handle any buffered data.""" - HTMLParser.close(self) # TODO: Use super when we drop PY2 support + parser.HTMLParser.close(self) # TODO: Use super when we drop PY2 support # Handle any unclosed tags. 
if len(self._cache): self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) @@ -129,7 +133,7 @@ def handle_decl(self, data): self.handle_empty_tag(''.format(data), is_block=True) def handle_pi(self, data): - self.handle_empty_tag(''.format(data), is_block=True) + self.handle_empty_tag(''.format(data), is_block=True) def handle_unknown_decl(self, data): self.handle_empty_tag(''.format(data), is_block=True) From b8f70b723e747e35c95ea09fb3adf9ef45042223 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sat, 16 Mar 2019 16:24:58 -0400 Subject: [PATCH 15/67] unknown_decl is not a handle method --- markdown/htmlparser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 562d60727..7934a8cbf 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -135,5 +135,6 @@ def handle_decl(self, data): def handle_pi(self, data): self.handle_empty_tag(''.format(data), is_block=True) - def handle_unknown_decl(self, data): - self.handle_empty_tag(''.format(data), is_block=True) + def unknown_decl(self, data): + end = ']]>' if data.startswith('CDATA[') else ']>' + self.handle_empty_tag(' Date: Sat, 16 Mar 2019 17:09:38 -0400 Subject: [PATCH 16/67] Switch back to a preprocessor --- markdown/blockprocessors.py | 24 ------------------------ markdown/preprocessors.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index 3dc63ef3a..a9e93e516 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -46,7 +46,6 @@ def build_block_parser(md, **kwargs): parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100) parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90) parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80) - parser.blockprocessors.register(RawHtmlProcessor(parser), 'html', 75) 
parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70) parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60) parser.blockprocessors.register(HRProcessor(parser), 'hr', 50) @@ -273,29 +272,6 @@ def run(self, parent, blocks): blocks.insert(0, theRest) -class RawHtmlProcessor(BlockProcessor): - - TAG_RE = re.compile(r'(^|\n)[ ]{0,3}<([?!].*?|(?P[^<> ]+)[^<>]*)>', re.S | re.U) - - def test(self, parent, block): - m = self.TAG_RE.search(block) - # If m but no 'tag', then we have a comment, declaration, or processing instruction. - return m and (self.parser.md.is_block_level(m.group('tag')) or not m.group('tag')) - - def run(self, parent, blocks): - parser = HTMLExtractor(md=self.parser.md) - while blocks: - parser.feed(blocks.pop(0) + '\n\n') - if not parser.inraw: - break - parser.close() - # Insert Markdown back into blocks with raw HTML extracted. - parts = ''.join(parser.cleandoc).split('\n\n') - parts.reverse() - for block in parts: - blocks.insert(0, block) - - class BlockQuoteProcessor(BlockProcessor): RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)') diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index c646b5ca4..528e1de84 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -26,6 +26,7 @@ """ from . import util +from .htmlparser import HTMLExtractor import re @@ -33,6 +34,7 @@ def build_preprocessors(md, **kwargs): """ Build the default set of preprocessors used by Markdown. 
""" preprocessors = util.Registry() preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30) + preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) preprocessors.register(ReferencePreprocessor(md), 'reference', 10) return preprocessors @@ -70,6 +72,17 @@ def run(self, lines): return source.split('\n') +class HtmlBlockPreprocessor(Preprocessor): + """Remove html blocks from the text and store them for later retrieval.""" + + def run(self, lines): + source = '\n'.join(lines) + parser = HTMLExtractor(md=self.md) + parser.feed(source) + parser.close() + return ''.join(parser.cleandoc).split('\n') + + class ReferencePreprocessor(Preprocessor): """ Remove reference definitions from text and store for later use. """ From 22151c7bb71f376ad7b21d30aa2314b854856f87 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 20 Mar 2019 11:50:46 -0400 Subject: [PATCH 17/67] Start audit of legacy tests --- tests/misc/block_html5.html | 16 -- tests/misc/block_html5.txt | 14 - tests/misc/comments.html | 9 - tests/misc/comments.txt | 10 - tests/misc/div.html | 10 - tests/misc/div.txt | 11 - tests/misc/mismatched-tags.html | 14 - tests/misc/mismatched-tags.txt | 9 - tests/misc/more_comments.html | 8 - tests/misc/more_comments.txt | 11 - tests/misc/multiline-comments.html | 37 --- tests/misc/multiline-comments.txt | 38 --- tests/misc/php.html | 11 - tests/misc/php.txt | 13 - tests/test_syntax/blocks/test_html_blocks.py | 276 ++++++++++++++++++- 15 files changed, 275 insertions(+), 212 deletions(-) delete mode 100644 tests/misc/block_html5.html delete mode 100644 tests/misc/block_html5.txt delete mode 100644 tests/misc/comments.html delete mode 100644 tests/misc/comments.txt delete mode 100644 tests/misc/div.html delete mode 100644 tests/misc/div.txt delete mode 100644 tests/misc/mismatched-tags.html delete mode 100644 tests/misc/mismatched-tags.txt delete mode 100644 tests/misc/more_comments.html delete mode 100644 tests/misc/more_comments.txt delete 
mode 100644 tests/misc/multiline-comments.html delete mode 100644 tests/misc/multiline-comments.txt delete mode 100644 tests/misc/php.html delete mode 100644 tests/misc/php.txt diff --git a/tests/misc/block_html5.html b/tests/misc/block_html5.html deleted file mode 100644 index b7a2fd328..000000000 --- a/tests/misc/block_html5.html +++ /dev/null @@ -1,16 +0,0 @@ -
-
-
-

Hello :-)

-
-
-
- -
Caption
-
-
-

Some footer

-
-
- -
\ No newline at end of file diff --git a/tests/misc/block_html5.txt b/tests/misc/block_html5.txt deleted file mode 100644 index 2b24cade7..000000000 --- a/tests/misc/block_html5.txt +++ /dev/null @@ -1,14 +0,0 @@ -
-
-
-

Hello :-)

-
-
-
- -
Caption
-
-
-

Some footer

-
-
diff --git a/tests/misc/comments.html b/tests/misc/comments.html deleted file mode 100644 index 2240ab9d3..000000000 --- a/tests/misc/comments.html +++ /dev/null @@ -1,9 +0,0 @@ -

X<0

-

X>0

- - -
as if
- - - -

no blank line

\ No newline at end of file diff --git a/tests/misc/comments.txt b/tests/misc/comments.txt deleted file mode 100644 index d9186f01b..000000000 --- a/tests/misc/comments.txt +++ /dev/null @@ -1,10 +0,0 @@ -X<0 - -X>0 - - - -
as if
- - -__no blank line__ diff --git a/tests/misc/div.html b/tests/misc/div.html deleted file mode 100644 index cb6a759e0..000000000 --- a/tests/misc/div.html +++ /dev/null @@ -1,10 +0,0 @@ - - -

And now in uppercase:

-
-foo -
\ No newline at end of file diff --git a/tests/misc/div.txt b/tests/misc/div.txt deleted file mode 100644 index 4ff972e2e..000000000 --- a/tests/misc/div.txt +++ /dev/null @@ -1,11 +0,0 @@ - - -And now in uppercase: - -
-foo -
diff --git a/tests/misc/mismatched-tags.html b/tests/misc/mismatched-tags.html deleted file mode 100644 index 06bd57f3b..000000000 --- a/tests/misc/mismatched-tags.html +++ /dev/null @@ -1,14 +0,0 @@ -

Some text

- -
some more text
- -

and a bit more

-

And this output

- -

Compatible with PHP Markdown Extra 1.2.2 and Markdown.pl1.0.2b8:

- - -

text

- -


-

Should be in p

\ No newline at end of file diff --git a/tests/misc/mismatched-tags.txt b/tests/misc/mismatched-tags.txt deleted file mode 100644 index 8e6a52f57..000000000 --- a/tests/misc/mismatched-tags.txt +++ /dev/null @@ -1,9 +0,0 @@ -

Some text

some more text
- -and a bit more - -

And this output

*Compatible with PHP Markdown Extra 1.2.2 and Markdown.pl1.0.2b8:* - -

text


- -Should be in p diff --git a/tests/misc/more_comments.html b/tests/misc/more_comments.html deleted file mode 100644 index 5ca673199..000000000 --- a/tests/misc/more_comments.html +++ /dev/null @@ -1,8 +0,0 @@ - - -

Foo

-

-

Bar

- - -

- -foo - -

- -
- -foo - -
- - - - - - - - - - - -

- -foo - -

- - -
- -foo - -
- - - - - - - - - - - -
This shouldn't
- - - -

<?php echo "not_block_level";?>

\ No newline at end of file diff --git a/tests/misc/php.txt b/tests/misc/php.txt deleted file mode 100644 index ca5be4532..000000000 --- a/tests/misc/php.txt +++ /dev/null @@ -1,13 +0,0 @@ - - -This should have a p tag - - - -
This shouldn't
- - - - - diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index a480ca7df..25c6fb87b 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -102,6 +102,26 @@ def test_raw_empty_blank_line(self): '

\n\n

' ) + # Note: this is a change in behavior. We don't preserve capitalization on closing tags. + def test_raw_uppercase(self): + self.assertMarkdownRenders( + '

foo

', + '

foo

' + ) + + # TODO: fix this. The blank line is optional but matches previous behavior and reference implementation. + def test_multiple_raw_single__line(self): + self.assertMarkdownRenders( + '

*foo*

*bar*
', + self.dedent( + """ +

*foo*

+ +
*bar*
+ """ + ) + ) + def test_multiline_raw(self): self.assertMarkdownRenders( self.dedent( @@ -169,7 +189,7 @@ def test_raw_surrounded_by_Markdown(self): ) ) - def test_raw_without_blank_lines(self): + def test_raw_surrounded_by_text_without_blank_lines(self): self.assertMarkdownRenders( self.dedent( """ @@ -189,6 +209,18 @@ def test_raw_without_blank_lines(self): ) ) + # TODO: fix this. A blank line between the tags is optional but would be a change in behavior. + def test_raw_one_line_followed_by_text(self): + self.assertMarkdownRenders( + '

*foo*

*bar*', + self.dedent( + """ +

*foo*

+

bar

+ """ + ) + ) + def test_raw_with_markdown_blocks(self): self.assertMarkdownRenders( self.dedent( @@ -253,6 +285,12 @@ def test_adjacent_raw_blocks_with_blank_lines(self): ) ) + def test_nested_raw_one_line(self): + self.assertMarkdownRenders( + '

*foo*

', + '

*foo*

' + ) + def test_nested_raw_block(self): self.assertMarkdownRenders( self.dedent( @@ -335,6 +373,12 @@ def test_nested_raw_blocks_with_blank_lines(self): ) ) + def test_nested_inline_one_line(self): + self.assertMarkdownRenders( + '

foo

', + '

foo

' + ) + def test_raw_nested_inline(self): self.assertMarkdownRenders( self.dedent( @@ -387,6 +431,46 @@ def test_raw_nested_inline_with_blank_lines(self): ) ) + def test_raw_html5(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
+
+

Hello :-)

+
+
+
+ +
Caption
+
+
+

Some footer

+
+
+ """ + ), + self.dedent( + """ +
+
+
+

Hello :-)

+
+
+
+ +
Caption
+
+
+

Some footer

+
+
+ """ + ) + ) + def test_raw_p_no_end_tag(self): self.assertMarkdownRenders( '

*text*', @@ -483,6 +567,12 @@ def test_raw_comment_one_line(self): '' ) + def test_raw_comment_one_line_with_tag(self): + self.assertMarkdownRenders( + '', + '' + ) + # Note: this is a change in behavior for Python_markdown but matches the reference implementation. # Previous output was `\n

bar

`. Browsers render both the same. def test_raw_comment_one_line_followed_by_text(self): @@ -491,6 +581,40 @@ def test_raw_comment_one_line_followed_by_text(self): '

bar

' ) + # TODO: Fix this. This matches Python-Markdown's previous behavior but not the reference implementation, + # which outputs `

bar

` (which is also the pre-fixed behavior). + def test_raw_comment_one_line_followed_by_html(self): + self.assertMarkdownRenders( + '

*bar*

', + self.dedent( + """ + +

*bar*

+ """ + ) + ) + + # TODO: Fix this. The trailing space is triping up the postprocessor: `

{placeholder}

`. + # Note: this reflects a slight change in behavior as the trailing spacer is preserved. This matches + # the reference implementation. However, it should be ok if we did not preserve the trailing space. + def test_raw_comment_trailing_whitespace(self): + self.assertMarkdownRenders( + ' ', + ' ' + ) + + # Note: this is a change in behavior for Python-Markdown, which does *not* match the reference + # implementation. However, it does match the HTML5 spec. Declarations must start with either + # `', + '' + ) + def test_raw_multiline_comment(self): self.assertMarkdownRenders( self.dedent( @@ -509,6 +633,56 @@ def test_raw_multiline_comment(self): ) ) + def test_raw_multiline_comment_with_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_multiline_comment_first_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_multiline_comment_last_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + def test_raw_comment_with_blank_lines(self): self.assertMarkdownRenders( self.dedent( @@ -531,6 +705,64 @@ def test_raw_comment_with_blank_lines(self): ) ) + def test_raw_comment_with_blank_lines_with_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_comment_with_blank_lines_first_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + + def test_raw_comment_with_blank_lines_last_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + + """ + ), + self.dedent( + """ + + """ + ) + ) + def test_raw_comment_indented(self): self.assertMarkdownRenders( self.dedent( @@ -553,6 +785,28 @@ def test_raw_comment_indented(self): ) ) + def test_raw_comment_indented_with_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ + + 
""" + ), + self.dedent( + """ + + """ + ) + ) + def test_raw_comment_nested(self): self.assertMarkdownRenders( self.dedent( @@ -582,6 +836,26 @@ def test_comment_in_code_block(self): ) ) + # Note: This is a change in behavior. Previously, Python-Markdown interpreted this in the same manner + # as browsers and all text after the opening comment tag was considered to be in a comment. However, + # that did not match the reference implementation. The new behavior does. + def test_unclosed_comment_(self): + self.assertMarkdownRenders( + self.dedent( + """ + - - - - - \ No newline at end of file diff --git a/tests/misc/block_html_attr.txt b/tests/misc/block_html_attr.txt deleted file mode 100644 index b2603cc88..000000000 --- a/tests/misc/block_html_attr.txt +++ /dev/null @@ -1,24 +0,0 @@ -
-Raw HTML processing should not confuse this with the blockquote below -
-
-
-
-
-

Header2

-
-
-

Header3

-

Paragraph

-

Header3

-

Paragraph

-
-

Paragraph

-
-

Paragraph

-

linktext

-
-
- -
-
diff --git a/tests/misc/block_html_simple.html b/tests/misc/block_html_simple.html deleted file mode 100644 index dce68bc19..000000000 --- a/tests/misc/block_html_simple.html +++ /dev/null @@ -1,10 +0,0 @@ -

foo

- -
    -
  • -

    bar

    -
  • -
  • -

    baz

    -
  • -
\ No newline at end of file diff --git a/tests/misc/block_html_simple.txt b/tests/misc/block_html_simple.txt deleted file mode 100644 index d108c50e3..000000000 --- a/tests/misc/block_html_simple.txt +++ /dev/null @@ -1,9 +0,0 @@ -

foo

-
    -
  • -

    bar

    -
  • -
  • -

    baz

    -
  • -
diff --git a/tests/misc/html-comments.html b/tests/misc/html-comments.html deleted file mode 100644 index 7b36246dc..000000000 --- a/tests/misc/html-comments.html +++ /dev/null @@ -1,2 +0,0 @@ -

Here is HTML -and once more

\ No newline at end of file diff --git a/tests/misc/html-comments.txt b/tests/misc/html-comments.txt deleted file mode 100644 index cac4da574..000000000 --- a/tests/misc/html-comments.txt +++ /dev/null @@ -1,2 +0,0 @@ -Here is HTML -and once more

diff --git a/tests/misc/html.html b/tests/misc/html.html deleted file mode 100644 index 293e6cce0..000000000 --- a/tests/misc/html.html +++ /dev/null @@ -1,29 +0,0 @@ -

Block level html

- -

Some inline stuff.

-

Now some arbitrary tags.

-
More block level html.
- -
-Html with various attributes. -
- -
-
- Div with a blank line - - in the middle. -
-
- This gets treated as HTML. -
-
- -

And of course .

-

this . - -[this ) - -Some funky inline stuff with markdown escaping syntax. - - - -And now a line with only an opening bracket: - -< - -And one with other stuff but no closing bracket: - -< foo - diff --git a/tests/misc/markup-inside-p.html b/tests/misc/markup-inside-p.html deleted file mode 100644 index 1b6b42097..000000000 --- a/tests/misc/markup-inside-p.html +++ /dev/null @@ -1,21 +0,0 @@ -

- -_foo_ - -

- -

-_foo_ -

- -

_foo_

- -

- -_foo_ -

- -

-_foo_ - -

\ No newline at end of file diff --git a/tests/misc/markup-inside-p.txt b/tests/misc/markup-inside-p.txt deleted file mode 100644 index ab7dd0f62..000000000 --- a/tests/misc/markup-inside-p.txt +++ /dev/null @@ -1,21 +0,0 @@ -

- -_foo_ - -

- -

-_foo_ -

- -

_foo_

- -

- -_foo_ -

- -

-_foo_ - -

diff --git a/tests/misc/multi-line-tags.html b/tests/misc/multi-line-tags.html deleted file mode 100644 index 69899aa62..000000000 --- a/tests/misc/multi-line-tags.html +++ /dev/null @@ -1,13 +0,0 @@ -
- -asdf asdfasd - -
- -
- -foo bar - -
- -

No blank line.

\ No newline at end of file diff --git a/tests/misc/multi-line-tags.txt b/tests/misc/multi-line-tags.txt deleted file mode 100644 index 905647376..000000000 --- a/tests/misc/multi-line-tags.txt +++ /dev/null @@ -1,13 +0,0 @@ - -
- -asdf asdfasd - -
- -
- -foo bar - -
-No blank line. diff --git a/tests/misc/pre.html b/tests/misc/pre.html deleted file mode 100644 index a44ae126b..000000000 --- a/tests/misc/pre.html +++ /dev/null @@ -1,13 +0,0 @@ -
-
-aaa
-
-bbb
-
- -
-* and this is pre-formatted content
-* and it should be printed just like this
-* and not formatted as a list
-
-
\ No newline at end of file diff --git a/tests/misc/pre.txt b/tests/misc/pre.txt deleted file mode 100644 index 31243b59c..000000000 --- a/tests/misc/pre.txt +++ /dev/null @@ -1,14 +0,0 @@ -
-
-aaa
-
-bbb
-
- -
-* and this is pre-formatted content
-* and it should be printed just like this
-* and not formatted as a list
-
-
- diff --git a/tests/misc/raw_whitespace.html b/tests/misc/raw_whitespace.html deleted file mode 100644 index 7a6f13184..000000000 --- a/tests/misc/raw_whitespace.html +++ /dev/null @@ -1,8 +0,0 @@ -

Preserve whitespace in raw html

-
-class Foo():
-    bar = 'bar'
-
-    def baz(self):
-        print self.bar
-
\ No newline at end of file diff --git a/tests/misc/raw_whitespace.txt b/tests/misc/raw_whitespace.txt deleted file mode 100644 index bbc7cece6..000000000 --- a/tests/misc/raw_whitespace.txt +++ /dev/null @@ -1,10 +0,0 @@ -Preserve whitespace in raw html - -
-class Foo():
-    bar = 'bar'
-
-    def baz(self):
-        print self.bar
-
- diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 25c6fb87b..50fb67d43 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -471,6 +471,55 @@ def test_raw_html5(self): ) ) + def test_raw_pre_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ + Preserve whitespace in raw html + +
+                class Foo():
+                    bar = 'bar'
+
+                    @property
+                    def baz(self):
+                        return self.bar
+                
+ """ + ), + self.dedent( + """ +

Preserve whitespace in raw html

+
+                class Foo():
+                    bar = 'bar'
+
+                    @property
+                    def baz(self):
+                        return self.bar
+                
+ """ + ) + ) + + def test_raw_pre_tag_nested_escaped_html(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+                <p>foo</p>
+                
+ """ + ), + self.dedent( + """ +
+                <p>foo</p>
+                
+ """ + ) + ) + def test_raw_p_no_end_tag(self): self.assertMarkdownRenders( '

*text*', @@ -548,14 +597,18 @@ def test_raw_attributes_nested(self): self.dedent( """

-

text

+

+ +

""" ), self.dedent( """
-

text

+

+ +

""" ) @@ -1063,6 +1116,13 @@ def test_nested_entityref(self): '

§

' ) + # TODO: Fix this. `&T;` is not a valid charref. + def test_amperstand(self): + self.assertMarkdownRenders( + 'AT&T & AT&T', + '

AT&T & AT&T

' + ) + def test_startendtag(self): self.assertMarkdownRenders( '', @@ -1102,3 +1162,20 @@ def test_auto_links_dont_break_parser(self): '.com">email@e' 'xample.com

' ) + + def test_text_links_ignored(self): + self.assertMarkdownRenders( + self.dedent( + """ + https://example.com + + email@example.com + """ + ), + self.dedent( + """ +

https://example.com

+

email@example.com

+ """ + ), + ) From 0e4a545bab05f11aa0cc28481fe0efd2601e6310 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 21 Mar 2019 10:26:02 -0400 Subject: [PATCH 19/67] More test audits --- tests/basic/inline-html-advanced.html | 12 ---- tests/basic/inline-html-advanced.txt | 14 ---- tests/basic/inline-html-comments.html | 11 --- tests/basic/inline-html-comments.txt | 13 ---- tests/basic/inline-html-simple.html | 61 ----------------- tests/basic/inline-html-simple.txt | 72 -------------------- tests/test_syntax/blocks/test_html_blocks.py | 58 +++++++++++++--- 7 files changed, 50 insertions(+), 191 deletions(-) delete mode 100644 tests/basic/inline-html-advanced.html delete mode 100644 tests/basic/inline-html-advanced.txt delete mode 100644 tests/basic/inline-html-comments.html delete mode 100644 tests/basic/inline-html-comments.txt delete mode 100644 tests/basic/inline-html-simple.html delete mode 100644 tests/basic/inline-html-simple.txt diff --git a/tests/basic/inline-html-advanced.html b/tests/basic/inline-html-advanced.html deleted file mode 100644 index af1dec1e8..000000000 --- a/tests/basic/inline-html-advanced.html +++ /dev/null @@ -1,12 +0,0 @@ -

Simple block on one line:

-
foo
- -

And nested without indentation:

-
-
-
-foo -
-
-
bar
-
\ No newline at end of file diff --git a/tests/basic/inline-html-advanced.txt b/tests/basic/inline-html-advanced.txt deleted file mode 100644 index 9d71ddcc1..000000000 --- a/tests/basic/inline-html-advanced.txt +++ /dev/null @@ -1,14 +0,0 @@ -Simple block on one line: - -
foo
- -And nested without indentation: - -
-
-
-foo -
-
-
bar
-
diff --git a/tests/basic/inline-html-comments.html b/tests/basic/inline-html-comments.html deleted file mode 100644 index 0d4cad949..000000000 --- a/tests/basic/inline-html-comments.html +++ /dev/null @@ -1,11 +0,0 @@ -

Paragraph one.

- - - - -

Paragraph two.

- - -

The end.

\ No newline at end of file diff --git a/tests/basic/inline-html-comments.txt b/tests/basic/inline-html-comments.txt deleted file mode 100644 index 41d830d03..000000000 --- a/tests/basic/inline-html-comments.txt +++ /dev/null @@ -1,13 +0,0 @@ -Paragraph one. - - - - - -Paragraph two. - - - -The end. diff --git a/tests/basic/inline-html-simple.html b/tests/basic/inline-html-simple.html deleted file mode 100644 index 0f2633cfe..000000000 --- a/tests/basic/inline-html-simple.html +++ /dev/null @@ -1,61 +0,0 @@ -

Here's a simple block:

-
- foo -
- -

This should be a code block, though:

-
<div>
-    foo
-</div>
-
-

As should this:

-
<div>foo</div>
-
-

Now, nested:

-
-
-
- foo -
-
-
- -

This should just be an HTML comment:

- - -

Multiline:

- - -

Code block:

-
<!-- Comment -->
-
-

Just plain comment, with trailing spaces on the line:

- - -

Code:

-
<hr />
-
-

Hr's:

-
- -
- -
- -
- -
- -
- -
- -
- -
- -

weird stuff>

-

> <> <

\ No newline at end of file diff --git a/tests/basic/inline-html-simple.txt b/tests/basic/inline-html-simple.txt deleted file mode 100644 index 359aca4e5..000000000 --- a/tests/basic/inline-html-simple.txt +++ /dev/null @@ -1,72 +0,0 @@ -Here's a simple block: - -
- foo -
- -This should be a code block, though: - -
- foo -
- -As should this: - -
foo
- -Now, nested: - -
-
-
- foo -
-
-
- -This should just be an HTML comment: - - - -Multiline: - - - -Code block: - - - -Just plain comment, with trailing spaces on the line: - - - -Code: - -
- -Hr's: - -
- -
- -
- -
- -
- -
- -
- -
- -
- - - -> <> < \ No newline at end of file diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 50fb67d43..2ff3ad7de 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1125,26 +1125,50 @@ def test_amperstand(self): def test_startendtag(self): self.assertMarkdownRenders( - '', - '

' + '
', + '
' + ) + + def test_startendtag_with_attrs(self): + self.assertMarkdownRenders( + '
', + '
' + ) + + def test_startendtag_with_space(self): + self.assertMarkdownRenders( + '
', + '
' ) def test_closed_startendtag(self): self.assertMarkdownRenders( - '', - '

' + '
', + '
' + ) + + def test_closed_startendtag_without_space(self): + self.assertMarkdownRenders( + '
', + '
' + ) + + def test_closed_startendtag_with_attrs(self): + self.assertMarkdownRenders( + '
', + '
' ) def test_nested_startendtag(self): self.assertMarkdownRenders( - '
', - '
' + '

', + '

' ) def test_nested_closed_startendtag(self): self.assertMarkdownRenders( - '
', - '
' + '

', + '

' ) def test_auto_links_dont_break_parser(self): @@ -1179,3 +1203,21 @@ def test_text_links_ignored(self): """ ), ) + + # TODO: fix this + def text_invalid_tags(self): + self.assertMarkdownRenders( + self.dedent( + """ + + + > <> < + """ + ), + self.dedent( + """ +

weird stuff>

+

> <> <

+ """ + ) + ) From 49c187dfca1865414ef8eec75c5ed297c62c0ac0 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 21 Mar 2019 11:17:38 -0400 Subject: [PATCH 20/67] Fix amperstand handling --- markdown/htmlparser.py | 4 +++- tests/test_syntax/blocks/test_html_blocks.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 7934a8cbf..d356f971c 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -26,10 +26,12 @@ import HTMLParser as parser except ImportError: from html import parser +import re # Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions. -import re parser.piclose = re.compile(r'\?>') +# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon. +parser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') class HTMLExtractor(parser.HTMLParser): diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 2ff3ad7de..ab566bfce 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -102,11 +102,11 @@ def test_raw_empty_blank_line(self): '

\n\n

' ) - # Note: this is a change in behavior. We don't preserve capitalization on closing tags. + # TODO: Fix this. Capitalization should be preserved on all tags. def test_raw_uppercase(self): self.assertMarkdownRenders( - '

foo

', - '

foo

' + '
foo
', + '
foo
' ) # TODO: fix this. The blank line is optional but matches previous behavior and reference implementation. From 3bc2960509d0cae8b516ac772d85336310a6e8dd Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 21 Mar 2019 16:11:39 -0400 Subject: [PATCH 21/67] preserve actual closing tags Retain capitalization, etc. The HTMLParser already has `get_starttag_text`. This does the same for endtag. --- markdown/htmlparser.py | 15 +++++++++++- tests/test_syntax/blocks/test_html_blocks.py | 24 ++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index d356f971c..ce070da01 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -83,7 +83,20 @@ def handle_starttag(self, tag, attrs): self.cleandoc.append(text) def handle_endtag(self, tag): - text = ''.format(tag) + # Attempt to extract actual tag from raw source text + if self.lineno > 1: + # Find start position: char index for end of line at self.lineno + self.offset + start = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata).end() + self.offset + else: + # On first line. Just use self.offset for start position. + start = self.offset + m = parser.endendtag.search(self.rawdata, start) + if m: + text = self.rawdata[start:m.end()] + else: + # Failed to extract from raw data. Assume well formed and lowercase. + text = ''.format(tag) + if tag in self.stack: while self.stack: if self.stack.pop() == tag: diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index ab566bfce..361442397 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -102,11 +102,28 @@ def test_raw_empty_blank_line(self): '

\n\n

' ) - # TODO: Fix this. Capitalization should be preserved on all tags. def test_raw_uppercase(self): self.assertMarkdownRenders( - '
foo
', - '
foo
' + '
*foo*
', + '
*foo*
' + ) + + def test_raw_uppercase_multiline(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + self.dedent( + """ +
+ *foo* +
+ """ + ) ) # TODO: fix this. The blank line is optional but matches previous behavior and reference implementation. @@ -1116,7 +1133,6 @@ def test_nested_entityref(self): '

§

' ) - # TODO: Fix this. `&T;` is not a valid charref. def test_amperstand(self): self.assertMarkdownRenders( 'AT&T & AT&T', From 49532727cdd5a5a85a18ff04be1b51d022a13b3a Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 22 Mar 2019 09:45:07 -0400 Subject: [PATCH 22/67] More bugs fixed --- markdown/htmlparser.py | 4 +- tests/test_syntax/blocks/test_html_blocks.py | 69 +++++++++++++------- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index ce070da01..dbb957b16 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -107,7 +107,7 @@ def handle_endtag(self, tag): self._cache.append(text) self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) # Insert blank line between this and next line. TODO: make this conditional?? - self.cleandoc.append('\n') + self.cleandoc.append('\n\n') self._cache = [] elif self.inraw: self._cache.append(text) @@ -129,6 +129,8 @@ def handle_empty_tag(self, data, is_block): elif col < 4 and is_block: # Handle this as a standalone raw block self.cleandoc.append(self.md.htmlStash.store(data)) + # Insert blank line between this and next line. + self.cleandoc.append('\n\n') else: self.cleandoc.append(data) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 361442397..18180fa9f 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -126,7 +126,8 @@ def test_raw_uppercase_multiline(self): ) ) - # TODO: fix this. The blank line is optional but matches previous behavior and reference implementation. + # Note: This is a change in behavior, but follows the rules and the reference implementation. + # To change we would need to not restrict block-level content to begin at start of line. def test_multiple_raw_single__line(self): self.assertMarkdownRenders( '

*foo*

*bar*
', @@ -134,7 +135,7 @@ def test_multiple_raw_single__line(self): """

*foo*

-
*bar*
+

bar

""" ) ) @@ -226,13 +227,14 @@ def test_raw_surrounded_by_text_without_blank_lines(self): ) ) - # TODO: fix this. A blank line between the tags is optional but would be a change in behavior. + # Note: The blank line between the tags is a change in behavior. def test_raw_one_line_followed_by_text(self): self.assertMarkdownRenders( '

*foo*

*bar*', self.dedent( """

*foo*

+

bar

""" ) @@ -643,34 +645,40 @@ def test_raw_comment_one_line_with_tag(self): '' ) - # Note: this is a change in behavior for Python_markdown but matches the reference implementation. - # Previous output was `\n

bar

`. Browsers render both the same. + # Note: this is a change in behavior for Python-Markdown only in that a blank line is added. + # While it does not match the reference implementation, there is no difference in rendering. def test_raw_comment_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', - '

bar

' + self.dedent( + """ + + +

bar

+ """ + ) ) - # TODO: Fix this. This matches Python-Markdown's previous behavior but not the reference implementation, - # which outputs `

bar

` (which is also the pre-fixed behavior). + # This is a change in behavior and does not match the reference implementation. + # We have no way to determine if text is on the same line, so we get this. TODO: reevaluate! def test_raw_comment_one_line_followed_by_html(self): self.assertMarkdownRenders( '

*bar*

', self.dedent( """ -

*bar*

+ +

bar

""" ) ) - # TODO: Fix this. The trailing space is triping up the postprocessor: `

{placeholder}

`. - # Note: this reflects a slight change in behavior as the trailing spacer is preserved. This matches - # the reference implementation. However, it should be ok if we did not preserve the trailing space. + # Note: Trailing (insignificant) whitespace is not preserved, which does not match the + # reference implementation. However, it is not a change in behavior for Python-Markdown. def test_raw_comment_trailing_whitespace(self): self.assertMarkdownRenders( ' ', - ' ' + '' ) # Note: this is a change in behavior for Python-Markdown, which does *not* match the reference @@ -932,11 +940,18 @@ def test_raw_processing_instruction_one_line(self): "'; ?>" ) - # This is inline as it is not on a line by itself. + # This is a change in behavior and does not match the reference implementation. + # We have no way to determine if text is on the same line, so we get this. TODO: reevaluate! def test_raw_processing_instruction_one_line_followed_by_text(self): self.assertMarkdownRenders( "'; ?>*bar*", - "

'; ?>bar

" + self.dedent( + """ + '; ?> + +

bar

+ """ + ) ) def test_raw_multiline_processing_instruction(self): @@ -1007,12 +1022,18 @@ def test_raw_declaration_one_line(self): '' ) - # Note: this is a change in behavior for Python_markdown but matches the reference implementation. - # Previous output was `*bar*`. + # This is a change in behavior and does not match the reference implementation. + # We have no way to determine if text is on the same line, so we get this. TODO: reevaluate! def test_raw_declaration_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', - '

bar

' + self.dedent( + """ + + +

bar

+ """ + ) ) def test_raw_multiline_declaration(self): @@ -1039,12 +1060,17 @@ def test_raw_cdata_one_line(self): '"); ]]>' ) - # Note: this is a change in behavior for Python_markdown but matches the reference implementation. - # Previous output was `"); ]]>*bar*`. + # Note: this is a change. Neither previous output nor this match reference implementation. def test_raw_cdata_one_line_followed_by_text(self): self.assertMarkdownRenders( '"); ]]>*bar*', - '

"); ]]>bar

' + self.dedent( + """ + "); ]]> + +

bar

+ """ + ) ) def test_raw_multiline_cdata(self): @@ -1220,7 +1246,6 @@ def test_text_links_ignored(self): ), ) - # TODO: fix this def text_invalid_tags(self): self.assertMarkdownRenders( self.dedent( From 29cc7ba0db5de5216c734ffb774388cc995675b4 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 22 Mar 2019 10:26:20 -0400 Subject: [PATCH 23/67] Account for code spans at start of line. Previously we didn't test for this because we did not allow raw blocks to have any indent. Now, that we allow up to 3 spaces of indent, we need to confirm those 3 chars are actually whitespace. Tests added. Note: the case where the code span is not on the first line is still failing. I havn't worked out why but ran out of time today. --- markdown/htmlparser.py | 33 ++++++++++++++------ tests/test_syntax/blocks/test_html_blocks.py | 29 +++++++++++++++-- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index dbb957b16..b457fb653 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -65,11 +65,30 @@ def close(self): self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) self._cache = [] + @property + def line_offset(self): + """Returns char index in self.rawdata for the start of the current line. """ + if self.lineno > 1: + return re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata).end() + return 0 + + def at_line_start(self): + """ + Returns True if current position is at start of line. + + Allows for up to three blank spaces at start of line. 
+ """ + if self.offset == 0: + return True + if self.offset > 3: + return False + # Confirm up to first 3 chars are whitespace + return self.rawdata[self.line_offset:self.offset].strip() == '' + def handle_starttag(self, tag, attrs): self.stack.append(tag) - line, col = self.getpos() - if col < 4 and self.md.is_block_level(tag) and not self.inraw: + if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: # Started a new raw block self.inraw = True if len(self.cleandoc): @@ -84,12 +103,7 @@ def handle_starttag(self, tag, attrs): def handle_endtag(self, tag): # Attempt to extract actual tag from raw source text - if self.lineno > 1: - # Find start position: char index for end of line at self.lineno + self.offset - start = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata).end() + self.offset - else: - # On first line. Just use self.offset for start position. - start = self.offset + start = self.line_offset + self.offset m = parser.endendtag.search(self.rawdata, start) if m: text = self.rawdata[start:m.end()] @@ -122,11 +136,10 @@ def handle_data(self, data): def handle_empty_tag(self, data, is_block): """ Handle empty tags (``). """ - line, col = self.getpos() if self.inraw: # Append this to the existing raw block self._cache.append(data) - elif col < 4 and is_block: + elif self.at_line_start() and is_block: # Handle this as a standalone raw block self.cleandoc.append(self.md.htmlStash.store(data)) # Insert blank line between this and next line. diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 18180fa9f..ceb65b804 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -74,8 +74,8 @@ def test_raw_span(self): def test_code_span(self): self.assertMarkdownRenders( - '`code span`', - '

<em>code span</em>

' + '`

code span

`', + '

<p>code span</p>

' ) def test_raw_empty(self): @@ -227,6 +227,25 @@ def test_raw_surrounded_by_text_without_blank_lines(self): ) ) + # TODO: Fix this. Not sure why its failing... + def test_multiline_markdown_with_code_span(self): + self.assertMarkdownRenders( + self.dedent( + """ + A paragraph with a block-level + `

code span

`, which is + at the start of a line. + """ + ), + self.dedent( + """ +

A paragraph with a block-level + <p>code span</p>. + More Markdown text.

+ """ + ) + ) + # Note: The blank line between the tags is a change in behavior. def test_raw_one_line_followed_by_text(self): self.assertMarkdownRenders( @@ -645,6 +664,12 @@ def test_raw_comment_one_line_with_tag(self): '' ) + def test_comment_in_code_span(self): + self.assertMarkdownRenders( + '``', + '

<!-- *foo* -->

' + ) + # Note: this is a change in behavior for Python-Markdown only in that a blank line is added. # While it does not match the reference implementation, there is no difference in rendering. def test_raw_comment_one_line_followed_by_text(self): From d09d6026f79e16789cbfbdc00c3c688fd433e6b0 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sun, 24 Mar 2019 11:54:21 -0400 Subject: [PATCH 24/67] Code spans at start of line 2nd attempt. There were two obvious reasons why that was failing. That's what hapens when I'm in a hurry. Fixed now. --- markdown/htmlparser.py | 2 +- tests/test_syntax/blocks/test_html_blocks.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index b457fb653..ee67ac10d 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -83,7 +83,7 @@ def at_line_start(self): if self.offset > 3: return False # Confirm up to first 3 chars are whitespace - return self.rawdata[self.line_offset:self.offset].strip() == '' + return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' def handle_starttag(self, tag, attrs): self.stack.append(tag) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index ceb65b804..6e4406cbe 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -227,7 +227,6 @@ def test_raw_surrounded_by_text_without_blank_lines(self): ) ) - # TODO: Fix this. Not sure why its failing... def test_multiline_markdown_with_code_span(self): self.assertMarkdownRenders( self.dedent( @@ -240,8 +239,8 @@ def test_multiline_markdown_with_code_span(self): self.dedent( """

A paragraph with a block-level - <p>code span</p>. - More Markdown text.

+ <p>code span</p>, which is + at the start of a line.

""" ) ) From 1e16fd08da7bcc8bb169f4360e14356330d60201 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 1 Jul 2020 13:36:32 -0400 Subject: [PATCH 25/67] Drop py2 and cleanup after rebase. --- markdown/blockprocessors.py | 1 - markdown/htmlparser.py | 22 +++++++------------- tests/test_syntax/blocks/test_html_blocks.py | 4 ++-- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index a9e93e516..e81f83c9a 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -35,7 +35,6 @@ import xml.etree.ElementTree as etree from . import util from .blockparser import BlockParser -from .htmlparser import HTMLExtractor logger = logging.getLogger('MARKDOWN') diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index ee67ac10d..8f09a2cf9 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Python Markdown @@ -13,19 +12,14 @@ Currently maintained by Waylan Limberg (https://github.com/waylan), Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). -Copyright 2007-2019 The Python Markdown Project (v. 1.7 and later) +Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later) Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) Copyright 2004 Manfred Stienstra (the original version) License: BSD (see LICENSE.md for details). """ -from __future__ import unicode_literals -from . import util -try: - import HTMLParser as parser -except ImportError: - from html import parser +from html import parser import re # Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions. 
@@ -43,10 +37,10 @@ class HTMLExtractor(parser.HTMLParser): """ def __init__(self, md, *args, **kwargs): - if util.PY3 and 'convert_charrefs' not in kwargs: + if 'convert_charrefs' not in kwargs: kwargs['convert_charrefs'] = False # This calls self.reset - parser.HTMLParser.__init__(self, *args, **kwargs) # TODO: Use super when we drop PY2 support + super().__init__(*args, **kwargs) self.md = md def reset(self): @@ -55,11 +49,11 @@ def reset(self): self.stack = [] # When inraw==True, stack contains a list of tags self._cache = [] self.cleandoc = [] - parser.HTMLParser.reset(self) # TODO: Use super when we drop PY2 support + super().reset() def close(self): """Handle any buffered data.""" - parser.HTMLParser.close(self) # TODO: Use super when we drop PY2 support + super().close() # Handle any unclosed tags. if len(self._cache): self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) @@ -82,7 +76,7 @@ def at_line_start(self): return True if self.offset > 3: return False - # Confirm up to first 3 chars are whitespace + # Confirm up to first 3 chars are whitespace return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' def handle_starttag(self, tag, attrs): @@ -109,7 +103,7 @@ def handle_endtag(self, tag): text = self.rawdata[start:m.end()] else: # Failed to extract from raw data. Assume well formed and lowercase. - text = ''.format(tag) + text = ''.format(tag) if tag in self.stack: while self.stack: diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 6e4406cbe..f75a773ba 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -972,7 +972,7 @@ def test_raw_processing_instruction_one_line_followed_by_text(self): self.dedent( """ '; ?> - +

bar

""" ) @@ -1054,7 +1054,7 @@ def test_raw_declaration_one_line_followed_by_text(self): self.dedent( """ - +

bar

""" ) From 9fe2473478695df03efc42468f2c8a5427786a01 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 1 Jul 2020 15:54:15 -0400 Subject: [PATCH 26/67] First attempt at md in raw. It fails as blocks are wrapped in
<p>
tags. --- markdown/htmlparser.py | 18 ++++++++++++++---- markdown/preprocessors.py | 4 +++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 8f09a2cf9..896e0247c 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -36,12 +36,13 @@ class HTMLExtractor(parser.HTMLParser): to `md` and the remaining text is stored in `cleandoc` as a list of strings. """ - def __init__(self, md, *args, **kwargs): + def __init__(self, md, md_in_raw, *args, **kwargs): if 'convert_charrefs' not in kwargs: kwargs['convert_charrefs'] = False # This calls self.reset super().__init__(*args, **kwargs) self.md = md + self.md_in_raw = md_in_raw def reset(self): """Reset this instance. Loses all unprocessed data.""" @@ -80,16 +81,25 @@ def at_line_start(self): return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' def handle_starttag(self, tag, attrs): + attrs = dict(attrs) self.stack.append(tag) if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: - # Started a new raw block - self.inraw = True + if not (self.md_in_raw and attrs.get('markdown', None) == '1'): + # Started a new raw block + self.inraw = True if len(self.cleandoc): # Insert blank line between this and previous line. self.cleandoc.append('\n') - text = self.get_starttag_text() + if self.md_in_raw and 'markdown' in attrs: + # Remove markdown attribute and rebuild start tag. 
+ attrs.pop('markdown') + attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else '' + text = '<{}{}>'.format(tag, attrs_str) + else: + text = self.get_starttag_text() + if self.inraw: self._cache.append(text) else: diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index 528e1de84..2402b6e0f 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -75,9 +75,11 @@ def run(self, lines): class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" + markdown_in_raw = False + def run(self, lines): source = '\n'.join(lines) - parser = HTMLExtractor(md=self.md) + parser = HTMLExtractor(self.md, self.markdown_in_raw) parser.feed(source) parser.close() return ''.join(parser.cleandoc).split('\n') From e4a87961b219d7850351ffd2a7db7489ba0be5dd Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 2 Jul 2020 14:17:51 -0400 Subject: [PATCH 27/67] Support markdown=1 Still some cleanup to do in the postprocessor. For example we need to better handle nested `
<p>
` tags. Also needs tests. However, in the common case, this works well. All `markdown=1` blocks are normalized to the same insignificant whitespace by the preprocessor:

    start-tag-placeholder

    content

    end-tag-placeholder

This results in the start and end tags each in a `<p>` tag by itself. The postprocessor then correctly removes the wrapping `<p>` tags. And the content gets wrapped in its own `<p>`. Note that PHP Markdown Extra follows this behavior:

*text*
*text*
*text*
Each results in the same normalized output:

text

text

text

--- markdown/extensions/md_in_html.py | 12 ++++++------ markdown/htmlparser.py | 28 ++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 500c16641..98fee6e37 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -88,12 +88,12 @@ def extendMarkdown(self, md): # Turn on processing of markdown text within raw html md.preprocessors['html_block'].markdown_in_raw = True - md.parser.blockprocessors.register( - MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 - ) - md.parser.blockprocessors.tag_counter = -1 - md.parser.blockprocessors.contain_span_tags = re.compile( - r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE) + # md.parser.blockprocessors.register( + # MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 + # ) + # md.parser.blockprocessors.tag_counter = -1 + # md.parser.blockprocessors.contain_span_tags = re.compile( + # r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE) def makeExtension(**kwargs): # pragma: no cover diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 896e0247c..8dd8dfdb2 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -48,6 +48,7 @@ def reset(self): """Reset this instance. 
Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags + self.mdstack = [] # WHen markdown=1, stack contains a list of tags self._cache = [] self.cleandoc = [] super().reset() @@ -80,6 +81,9 @@ def at_line_start(self): # Confirm up to first 3 chars are whitespace return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' + def store_cache(self): + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + def handle_starttag(self, tag, attrs): attrs = dict(attrs) self.stack.append(tag) @@ -92,18 +96,21 @@ def handle_starttag(self, tag, attrs): # Insert blank line between this and previous line. self.cleandoc.append('\n') - if self.md_in_raw and 'markdown' in attrs: + if not self.inraw and self.md_in_raw and 'markdown' in attrs: + self.mdstack.append(tag) # Remove markdown attribute and rebuild start tag. attrs.pop('markdown') attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else '' text = '<{}{}>'.format(tag, attrs_str) + self.cleandoc.append(self.md.htmlStash.store(text)) + if tag != 'p': + self.cleandoc.append('\n\n') else: text = self.get_starttag_text() - - if self.inraw: - self._cache.append(text) - else: - self.cleandoc.append(text) + if self.inraw: + self._cache.append(text) + else: + self.cleandoc.append(text) def handle_endtag(self, tag): # Attempt to extract actual tag from raw source text @@ -129,6 +136,15 @@ def handle_endtag(self, tag): self._cache = [] elif self.inraw: self._cache.append(text) + elif tag in self.mdstack: + # Handle closing tag of markdown=1 element + while self.mdstack: + if self.mdstack.pop() == tag: + break + if tag != 'p': + self.cleandoc.append('\n\n') + self.cleandoc.append(self.md.htmlStash.store(text)) + self.cleandoc.append('\n\n') else: self.cleandoc.append(text) From 1d17525d40fb7b043b9fdf10938c680d285444ca Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 7 Jul 2020 09:39:26 -0400 
Subject: [PATCH 28/67] Eliminate extra blank lines. In most cases, this restored previous behavior. However, in a few cases, it is a change in behavior. See the legacy tests for the changes. Any changes here are of insignificant whitespace in the output, however. --- markdown/postprocessors.py | 5 ++--- tests/misc/hash.html | 2 -- tests/misc/span.html | 2 -- tests/test_syntax/blocks/test_html_blocks.py | 16 ---------------- 4 files changed, 2 insertions(+), 23 deletions(-) diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py index 95b85cd1e..cd32687d2 100644 --- a/markdown/postprocessors.py +++ b/markdown/postprocessors.py @@ -71,9 +71,8 @@ def run(self, text): for i in range(self.md.htmlStash.html_counter): html = self.md.htmlStash.rawHtmlBlocks[i] if self.isblocklevel(html): - replacements["

%s

" % - (self.md.htmlStash.get_placeholder(i))] = \ - html + "\n" + replacements["

{}

".format( + self.md.htmlStash.get_placeholder(i))] = html replacements[self.md.htmlStash.get_placeholder(i)] = html if replacements: diff --git a/tests/misc/hash.html b/tests/misc/hash.html index 186599476..95473c3f6 100644 --- a/tests/misc/hash.html +++ b/tests/misc/hash.html @@ -2,10 +2,8 @@
 #!/usr/bin/python
 hello
-

a

 !/usr/bin/python
 hello
-

a

\ No newline at end of file diff --git a/tests/misc/span.html b/tests/misc/span.html index bafcf0f23..5d711f84d 100644 --- a/tests/misc/span.html +++ b/tests/misc/span.html @@ -1,6 +1,4 @@

Foo bar Baz

*foo*
-
Foo *bar* Baz
-

Foo bar Baz

\ No newline at end of file diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index f75a773ba..2a7715118 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -134,7 +134,6 @@ def test_multiple_raw_single__line(self): self.dedent( """

*foo*

-

bar

""" ) @@ -201,7 +200,6 @@ def test_raw_surrounded_by_Markdown(self): """

Some Markdown text.

*Raw* HTML.

-

More Markdown text.

""" ) @@ -216,12 +214,10 @@ def test_raw_surrounded_by_text_without_blank_lines(self): More *Markdown* text. """ ), - # TODO: Work out a way to eliminate the extra blank line. self.dedent( """

Some Markdown text.

*Raw* HTML.

-

More Markdown text.

""" ) @@ -245,14 +241,12 @@ def test_multiline_markdown_with_code_span(self): ) ) - # Note: The blank line between the tags is a change in behavior. def test_raw_one_line_followed_by_text(self): self.assertMarkdownRenders( '

*foo*

*bar*', self.dedent( """

*foo*

-

bar

""" ) @@ -294,11 +288,9 @@ def test_adjacent_raw_blocks(self):

A second raw paragraph.

""" ), - # TODO: Work out a way to eliminate the extra blank line. self.dedent( """

A raw paragraph.

-

A second raw paragraph.

""" ) @@ -316,7 +308,6 @@ def test_adjacent_raw_blocks_with_blank_lines(self): self.dedent( """

A raw paragraph.

-

A second raw paragraph.

""" ) @@ -669,15 +660,12 @@ def test_comment_in_code_span(self): '

<!-- *foo* -->

' ) - # Note: this is a change in behavior for Python-Markdown only in that a blank line is added. - # While it does not match the reference implementation, there is no difference in rendering. def test_raw_comment_one_line_followed_by_text(self): self.assertMarkdownRenders( '*bar*', self.dedent( """ -

bar

""" ) @@ -691,7 +679,6 @@ def test_raw_comment_one_line_followed_by_html(self): self.dedent( """ -

bar

""" ) @@ -972,7 +959,6 @@ def test_raw_processing_instruction_one_line_followed_by_text(self): self.dedent( """ '; ?> -

bar

""" ) @@ -1054,7 +1040,6 @@ def test_raw_declaration_one_line_followed_by_text(self): self.dedent( """ -

bar

""" ) @@ -1091,7 +1076,6 @@ def test_raw_cdata_one_line_followed_by_text(self): self.dedent( """ "); ]]> -

bar

""" ) From 6b4b3519b8d9d415e498b95f4376ca5905deb265 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 7 Jul 2020 11:43:53 -0400 Subject: [PATCH 29/67] Add more tests --- tests/test_syntax/blocks/test_html_blocks.py | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 2a7715118..2b44982fe 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -78,6 +78,12 @@ def test_code_span(self): '

<p>code span</p>

' ) + def test_code_span_open_gt(self): + self.assertMarkdownRenders( + '*bar* `<` *foo*', + '

bar < foo

' + ) + def test_raw_empty(self): self.assertMarkdownRenders( '

', @@ -241,6 +247,26 @@ def test_multiline_markdown_with_code_span(self): ) ) + def test_raw_block_preceded_by_markdown_code_span_with_unclosed_block_tag(self): + self.assertMarkdownRenders( + self.dedent( + """ + A paragraph with a block-level code span: `
`. + +

*not markdown*

+ + This is *markdown* + """ + ), + self.dedent( + """ +

A paragraph with a block-level code span: <div>.

+

*not markdown*

+

This is markdown

+ """ + ) + ) + def test_raw_one_line_followed_by_text(self): self.assertMarkdownRenders( '

*foo*

*bar*', From c0194f36406843209647bda3817bab23b935dd28 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 7 Jul 2020 14:28:08 -0400 Subject: [PATCH 30/67] Track index of containing tag in stack. --- markdown/htmlparser.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 8dd8dfdb2..9c3e5c12b 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -48,7 +48,8 @@ def reset(self): """Reset this instance. Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags - self.mdstack = [] # WHen markdown=1, stack contains a list of tags + self.container_index = None # Index in stack of parent tag of raw block + self.mdstack = [] # When markdown=1, stack contains a list of tags self._cache = [] self.cleandoc = [] super().reset() @@ -92,6 +93,7 @@ def handle_starttag(self, tag, attrs): if not (self.md_in_raw and attrs.get('markdown', None) == '1'): # Started a new raw block self.inraw = True + self.container_index = len(self.stack) - 1 if len(self.cleandoc): # Insert blank line between this and previous line. self.cleandoc.append('\n') @@ -126,9 +128,10 @@ def handle_endtag(self, tag): while self.stack: if self.stack.pop() == tag: break - if self.inraw and len(self.stack) == 0: + if self.inraw and len(self.stack) <= self.container_index: # End of raw block self.inraw = False + self.container_index = None self._cache.append(text) self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) # Insert blank line between this and next line. TODO: make this conditional?? From 23375a51a88d2aaa761d3584079f0948c1c8d6b2 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 7 Jul 2020 15:05:47 -0400 Subject: [PATCH 31/67] Minor tweaks. 
--- markdown/htmlparser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 9c3e5c12b..86c51b282 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -48,7 +48,7 @@ def reset(self): """Reset this instance. Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags - self.container_index = None # Index in stack of parent tag of raw block + self.container_index = -1 # Index in stack of parent tag of raw block self.mdstack = [] # When markdown=1, stack contains a list of tags self._cache = [] self.cleandoc = [] @@ -131,7 +131,8 @@ def handle_endtag(self, tag): if self.inraw and len(self.stack) <= self.container_index: # End of raw block self.inraw = False - self.container_index = None + self.stack = [] # Reset stack as it could have extranious items in it. + self.container_index = -1 self._cache.append(text) self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) # Insert blank line between this and next line. TODO: make this conditional?? From 9ffead57f295ba42563c512ac92ee31bbec76ced Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 8 Jul 2020 16:04:38 -0400 Subject: [PATCH 32/67] break md_in_html out into subclass of HTML parser. --- markdown/extensions/md_in_html.py | 93 ++++++++++++++++++++++++++++++- markdown/htmlparser.py | 38 +++---------- markdown/preprocessors.py | 4 +- 3 files changed, 100 insertions(+), 35 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 98fee6e37..fd0384d4a 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -16,11 +16,100 @@ from . import Extension from ..blockprocessors import BlockProcessor +from ..preprocessors import Preprocessor from .. 
import util +from ..htmlparser import HTMLExtractor +from html import parser import re import xml.etree.ElementTree as etree +class HTMLExtractorExtra(HTMLExtractor): + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.mdstack = [] # When markdown=1, stack contains a list of tags + super().reset() + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + self.stack.append(tag) + + if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: + if not attrs.get('markdown', None) == '1': + # Started a new raw block + self.inraw = True + self.container_index = len(self.stack) - 1 + if len(self.cleandoc): + # Insert blank line between this and previous line. + self.cleandoc.append('\n') + + if not self.inraw and 'markdown' in attrs: + self.mdstack.append(tag) + # Remove markdown attribute and rebuild start tag. + attrs.pop('markdown') + attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else '' + text = '<{}{}>'.format(tag, attrs_str) + self.cleandoc.append(self.md.htmlStash.store(text)) + if tag != 'p': + self.cleandoc.append('\n\n') + else: + text = self.get_starttag_text() + if self.inraw: + self._cache.append(text) + else: + self.cleandoc.append(text) + + def handle_endtag(self, tag): + # Attempt to extract actual tag from raw source text + start = self.line_offset + self.offset + m = parser.endendtag.search(self.rawdata, start) + if m: + text = self.rawdata[start:m.end()] + else: + # Failed to extract from raw data. Assume well formed and lowercase. + text = ''.format(tag) + + if tag in self.stack: + while self.stack: + if self.stack.pop() == tag: + break + if self.inraw and len(self.stack) <= self.container_index: + # End of raw block + self.inraw = False + self.stack = [] # Reset stack as it could have extranious items in it. 
+ self.container_index = -1 + self._cache.append(text) + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + # Insert blank line between this and next line. TODO: make this conditional?? + self.cleandoc.append('\n\n') + self._cache = [] + elif self.inraw: + self._cache.append(text) + elif tag in self.mdstack: + # Handle closing tag of markdown=1 element + while self.mdstack: + if self.mdstack.pop() == tag: + break + if tag != 'p': + self.cleandoc.append('\n\n') + self.cleandoc.append(self.md.htmlStash.store(text)) + self.cleandoc.append('\n\n') + else: + self.cleandoc.append(text) + + +class HtmlBlockPreprocessor(Preprocessor): + """Remove html blocks from the text and store them for later retrieval.""" + + def run(self, lines): + source = '\n'.join(lines) + parser = HTMLExtractorExtra(self.md) + parser.feed(source) + parser.close() + return ''.join(parser.cleandoc).split('\n') + + class MarkdownInHtmlProcessor(BlockProcessor): """Process Markdown Inside HTML Blocks.""" def test(self, parent, block): @@ -86,8 +175,8 @@ class MarkdownInHtmlExtension(Extension): def extendMarkdown(self, md): """ Register extension instances. """ - # Turn on processing of markdown text within raw html - md.preprocessors['html_block'].markdown_in_raw = True + # Replace raw HTML preprocessor + md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) # md.parser.blockprocessors.register( # MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 # ) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 86c51b282..fb12dec68 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -36,20 +36,18 @@ class HTMLExtractor(parser.HTMLParser): to `md` and the remaining text is stored in `cleandoc` as a list of strings. 
""" - def __init__(self, md, md_in_raw, *args, **kwargs): + def __init__(self, md, *args, **kwargs): if 'convert_charrefs' not in kwargs: kwargs['convert_charrefs'] = False # This calls self.reset super().__init__(*args, **kwargs) self.md = md - self.md_in_raw = md_in_raw def reset(self): """Reset this instance. Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags self.container_index = -1 # Index in stack of parent tag of raw block - self.mdstack = [] # When markdown=1, stack contains a list of tags self._cache = [] self.cleandoc = [] super().reset() @@ -90,29 +88,18 @@ def handle_starttag(self, tag, attrs): self.stack.append(tag) if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: - if not (self.md_in_raw and attrs.get('markdown', None) == '1'): - # Started a new raw block - self.inraw = True - self.container_index = len(self.stack) - 1 + # Started a new raw block + self.inraw = True + self.container_index = len(self.stack) - 1 if len(self.cleandoc): # Insert blank line between this and previous line. self.cleandoc.append('\n') - if not self.inraw and self.md_in_raw and 'markdown' in attrs: - self.mdstack.append(tag) - # Remove markdown attribute and rebuild start tag. 
- attrs.pop('markdown') - attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else '' - text = '<{}{}>'.format(tag, attrs_str) - self.cleandoc.append(self.md.htmlStash.store(text)) - if tag != 'p': - self.cleandoc.append('\n\n') + text = self.get_starttag_text() + if self.inraw: + self._cache.append(text) else: - text = self.get_starttag_text() - if self.inraw: - self._cache.append(text) - else: - self.cleandoc.append(text) + self.cleandoc.append(text) def handle_endtag(self, tag): # Attempt to extract actual tag from raw source text @@ -140,15 +127,6 @@ def handle_endtag(self, tag): self._cache = [] elif self.inraw: self._cache.append(text) - elif tag in self.mdstack: - # Handle closing tag of markdown=1 element - while self.mdstack: - if self.mdstack.pop() == tag: - break - if tag != 'p': - self.cleandoc.append('\n\n') - self.cleandoc.append(self.md.htmlStash.store(text)) - self.cleandoc.append('\n\n') else: self.cleandoc.append(text) diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index 2402b6e0f..76a25428a 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -75,11 +75,9 @@ def run(self, lines): class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" - markdown_in_raw = False - def run(self, lines): source = '\n'.join(lines) - parser = HTMLExtractor(self.md, self.markdown_in_raw) + parser = HTMLExtractor(self.md) parser.feed(source) parser.close() return ''.join(parser.cleandoc).split('\n') From e3ff3686fb7091202669bba1ae0d1d5059e5aca0 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 8 Jul 2020 16:16:55 -0400 Subject: [PATCH 33/67] Only put raw tags in stack. 
--- markdown/htmlparser.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index fb12dec68..e2bf3e822 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -47,7 +47,6 @@ def reset(self): """Reset this instance. Loses all unprocessed data.""" self.inraw = False self.stack = [] # When inraw==True, stack contains a list of tags - self.container_index = -1 # Index in stack of parent tag of raw block self._cache = [] self.cleandoc = [] super().reset() @@ -85,18 +84,17 @@ def store_cache(self): def handle_starttag(self, tag, attrs): attrs = dict(attrs) - self.stack.append(tag) if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: # Started a new raw block self.inraw = True - self.container_index = len(self.stack) - 1 if len(self.cleandoc): # Insert blank line between this and previous line. self.cleandoc.append('\n') text = self.get_starttag_text() if self.inraw: + self.stack.append(tag) self._cache.append(text) else: self.cleandoc.append(text) @@ -111,15 +109,13 @@ def handle_endtag(self, tag): # Failed to extract from raw data. Assume well formed and lowercase. text = ''.format(tag) - if tag in self.stack: + if self.inraw and tag in self.stack: while self.stack: if self.stack.pop() == tag: break - if self.inraw and len(self.stack) <= self.container_index: + if self.inraw and len(self.stack) == 0: # End of raw block self.inraw = False - self.stack = [] # Reset stack as it could have extranious items in it. - self.container_index = -1 self._cache.append(text) self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) # Insert blank line between this and next line. TODO: make this conditional?? From c96efada59c8adbc9286783b64b88fb670274bc2 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 9 Jul 2020 11:57:58 -0400 Subject: [PATCH 34/67] Refactor and simplify logic. 
--- markdown/htmlparser.py | 59 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index e2bf3e822..cd2b8c0f0 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -79,18 +79,28 @@ def at_line_start(self): # Confirm up to first 3 chars are whitespace return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == '' - def store_cache(self): - self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + def get_endtag_text(self, tag): + """ + Returns the text of the end tag. + + If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`. + """ + # Attempt to extract actual tag from raw source text + start = self.line_offset + self.offset + m = parser.endendtag.search(self.rawdata, start) + if m: + return self.rawdata[start:m.end()] + else: + # Failed to extract from raw data. Assume well formed and lowercase. + return ''.format(tag) def handle_starttag(self, tag, attrs): attrs = dict(attrs) if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: - # Started a new raw block + # Started a new raw block. Prepare stack. self.inraw = True - if len(self.cleandoc): - # Insert blank line between this and previous line. - self.cleandoc.append('\n') + self.cleandoc.append('\n') text = self.get_starttag_text() if self.inraw: @@ -100,29 +110,22 @@ def handle_starttag(self, tag, attrs): self.cleandoc.append(text) def handle_endtag(self, tag): - # Attempt to extract actual tag from raw source text - start = self.line_offset + self.offset - m = parser.endendtag.search(self.rawdata, start) - if m: - text = self.rawdata[start:m.end()] - else: - # Failed to extract from raw data. Assume well formed and lowercase. 
- text = ''.format(tag) - - if self.inraw and tag in self.stack: - while self.stack: - if self.stack.pop() == tag: - break - if self.inraw and len(self.stack) == 0: - # End of raw block - self.inraw = False - self._cache.append(text) - self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) - # Insert blank line between this and next line. TODO: make this conditional?? - self.cleandoc.append('\n\n') - self._cache = [] - elif self.inraw: + text = self.get_endtag_text(tag) + + if self.inraw: self._cache.append(text) + if tag in self.stack: + # Remove tag from stack + while self.stack: + if self.stack.pop() == tag: + break + if len(self.stack) == 0: + # End of raw block. Reset stack. + self.inraw = False + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + # Insert blank line between this and next line. + self.cleandoc.append('\n\n') + self._cache = [] else: self.cleandoc.append(text) From 37ff86abb3abe60144c6c0c65a3586b2255f19ff Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sun, 12 Jul 2020 15:01:26 -0400 Subject: [PATCH 35/67] Disable 'incomplete' entity handling of HTMLParser. --- markdown/htmlparser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index cd2b8c0f0..db63c2fc4 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -26,6 +26,10 @@ parser.piclose = re.compile(r'\?>') # Monkeypatch HTMLParser to only recognize entity references with a closing semicolon. parser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') +# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block, +# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete, +# and the two regex are the same, then incomplete will simply never match and we avoid the logic within. 
+parser.incomplete = parser.entityref class HTMLExtractor(parser.HTMLParser): From f02b42741c8f663c27428890c9771442450419be Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sun, 12 Jul 2020 20:30:52 -0400 Subject: [PATCH 36/67] Fixed whitespace issues. --- markdown/htmlparser.py | 5 ++++- tests/misc/hash.html | 2 ++ tests/misc/span.html | 2 ++ tests/test_syntax/blocks/test_html_blocks.py | 3 +++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index db63c2fc4..1a49d5ac2 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -124,7 +124,10 @@ def handle_endtag(self, tag): if self.stack.pop() == tag: break if len(self.stack) == 0: - # End of raw block. Reset stack. + # End of raw block. + if self.rawdata[self.line_offset + self.offset + len(text):].startswith('\n\n'): + self._cache.append('\n') + # Reset stack. self.inraw = False self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) # Insert blank line between this and next line. diff --git a/tests/misc/hash.html b/tests/misc/hash.html index 95473c3f6..186599476 100644 --- a/tests/misc/hash.html +++ b/tests/misc/hash.html @@ -2,8 +2,10 @@
 #!/usr/bin/python
 hello
+

a

 !/usr/bin/python
 hello
+

a

\ No newline at end of file diff --git a/tests/misc/span.html b/tests/misc/span.html index 5d711f84d..bafcf0f23 100644 --- a/tests/misc/span.html +++ b/tests/misc/span.html @@ -1,4 +1,6 @@

Foo bar Baz

*foo*
+
Foo *bar* Baz
+

Foo bar Baz

\ No newline at end of file diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 2b44982fe..57e7982a8 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -206,6 +206,7 @@ def test_raw_surrounded_by_Markdown(self): """

Some Markdown text.

*Raw* HTML.

+

More Markdown text.

""" ) @@ -262,6 +263,7 @@ def test_raw_block_preceded_by_markdown_code_span_with_unclosed_block_tag(self): """

A paragraph with a block-level code span: <div>.

*not markdown*

+

This is markdown

""" ) @@ -334,6 +336,7 @@ def test_adjacent_raw_blocks_with_blank_lines(self): self.dedent( """

A raw paragraph.

+

A second raw paragraph.

""" ) From efa36c8a03d29783d930d595cf341f34a080aa7d Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sun, 12 Jul 2020 21:00:08 -0400 Subject: [PATCH 37/67] Import copy of html.parser so our monkeypatches don't break user's code. --- markdown/htmlparser.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 1a49d5ac2..c3767b8ae 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -19,20 +19,29 @@ License: BSD (see LICENSE.md for details). """ -from html import parser import re +import importlib +import sys + + +# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it. +# Users can still do `from html import parser` and get the default behavior. +spec = importlib.util.find_spec('html.parser') +htmlparser = importlib.util.module_from_spec(spec) +spec.loader.exec_module(htmlparser) +sys.modules['htmlparser'] = htmlparser # Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions. -parser.piclose = re.compile(r'\?>') +htmlparser.piclose = re.compile(r'\?>') # Monkeypatch HTMLParser to only recognize entity references with a closing semicolon. -parser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') +htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') # Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block, # so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete, # and the two regex are the same, then incomplete will simply never match and we avoid the logic within. -parser.incomplete = parser.entityref +htmlparser.incomplete = htmlparser.entityref -class HTMLExtractor(parser.HTMLParser): +class HTMLExtractor(htmlparser.HTMLParser): """ Extract raw HTML from text. 
@@ -91,7 +100,7 @@ def get_endtag_text(self, tag): """ # Attempt to extract actual tag from raw source text start = self.line_offset + self.offset - m = parser.endendtag.search(self.rawdata, start) + m = htmlparser.endendtag.search(self.rawdata, start) if m: return self.rawdata[start:m.end()] else: From a8145f8abd4c065958d0311dcf8eebb04b6a4fe1 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 13 Jul 2020 20:22:49 -0400 Subject: [PATCH 38/67] Handle raw blocks in tail of previous block. --- markdown/htmlparser.py | 13 ++++++++++-- tests/test_syntax/blocks/test_html_blocks.py | 21 +++++++++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index c3767b8ae..9945ebbec 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -59,6 +59,7 @@ def __init__(self, md, *args, **kwargs): def reset(self): """Reset this instance. Loses all unprocessed data.""" self.inraw = False + self.intail = False self.stack = [] # When inraw==True, stack contains a list of tags self._cache = [] self.cleandoc = [] @@ -110,7 +111,7 @@ def get_endtag_text(self, tag): def handle_starttag(self, tag, attrs): attrs = dict(attrs) - if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: + if self.intail or (self.at_line_start() and self.md.is_block_level(tag) and not self.inraw): # Started a new raw block. Prepare stack. self.inraw = True self.cleandoc.append('\n') @@ -136,6 +137,8 @@ def handle_endtag(self, tag): # End of raw block. if self.rawdata[self.line_offset + self.offset + len(text):].startswith('\n\n'): self._cache.append('\n') + else: + self.intail = True # Reset stack. 
self.inraw = False self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) @@ -146,6 +149,8 @@ def handle_endtag(self, tag): self.cleandoc.append(text) def handle_data(self, data): + if self.intail and '\n' in data: + self.intail = False if self.inraw: self._cache.append(data) else: @@ -153,11 +158,15 @@ def handle_data(self, data): def handle_empty_tag(self, data, is_block): """ Handle empty tags (``). """ - if self.inraw: + if self.inraw or self.intail: # Append this to the existing raw block self._cache.append(data) elif self.at_line_start() and is_block: # Handle this as a standalone raw block + if self.rawdata[self.line_offset + self.offset + len(data):].startswith('\n\n'): + data += '\n' + else: + self.intail = True self.cleandoc.append(self.md.htmlStash.store(data)) # Insert blank line between this and next line. self.cleandoc.append('\n\n') diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 57e7982a8..a4e7513d6 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -132,15 +132,24 @@ def test_raw_uppercase_multiline(self): ) ) - # Note: This is a change in behavior, but follows the rules and the reference implementation. - # To change we would need to not restrict block-level content to begin at start of line. - def test_multiple_raw_single__line(self): + def test_multiple_raw_single_line(self): self.assertMarkdownRenders( '

*foo*

*bar*
', self.dedent( """

*foo*

-

bar

+
*bar*
+ """ + ) + ) + + def test_multiple_raw_single_line_with_pi(self): + self.assertMarkdownRenders( + "

*foo*

'; ?>", + self.dedent( + """ +

*foo*

+ '; ?> """ ) ) @@ -700,15 +709,13 @@ def test_raw_comment_one_line_followed_by_text(self): ) ) - # This is a change in behavior and does not match the reference implementation. - # We have no way to determine if text is on the same line, so we get this. TODO: reevaluate! def test_raw_comment_one_line_followed_by_html(self): self.assertMarkdownRenders( '

*bar*

', self.dedent( """ -

bar

+

*bar*

""" ) ) From 70d26244e7b99139b0092d2ca18a139972f19d56 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 14 Jul 2020 11:50:23 -0400 Subject: [PATCH 39/67] Account for extra whitespace on blank lines. --- markdown/htmlparser.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 9945ebbec..bd7e22e0c 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -40,6 +40,10 @@ # and the two regex are the same, then incomplete will simply never match and we avoid the logic within. htmlparser.incomplete = htmlparser.entityref +# Match a blank line at the start of a block of text (two newlines). +# The newlines may be preceded by additional whitespace. +blank_line_re = re.compile(r'^([ ]*\n){2}') + class HTMLExtractor(htmlparser.HTMLParser): """ @@ -135,9 +139,11 @@ def handle_endtag(self, tag): break if len(self.stack) == 0: # End of raw block. - if self.rawdata[self.line_offset + self.offset + len(text):].startswith('\n\n'): + if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]): + # Preserve blank line and end of raw block. self._cache.append('\n') else: + # More content exists after endtag. self.intail = True # Reset stack. self.inraw = False @@ -163,9 +169,11 @@ def handle_empty_tag(self, data, is_block): self._cache.append(data) elif self.at_line_start() and is_block: # Handle this as a standalone raw block - if self.rawdata[self.line_offset + self.offset + len(data):].startswith('\n\n'): + if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]): + # Preserve blank line after tag in raw block. data += '\n' else: + # More content exists after tag. self.intail = True self.cleandoc.append(self.md.htmlStash.store(data)) # Insert blank line between this and next line. 
From 335816e231cd8bc540d5cd14348ee5b6542aa61d Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 14 Jul 2020 13:22:39 -0400 Subject: [PATCH 40/67] Handle inline raw html in tail. --- markdown/htmlparser.py | 2 +- tests/test_syntax/blocks/test_html_blocks.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index bd7e22e0c..a8f792580 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -115,7 +115,7 @@ def get_endtag_text(self, tag): def handle_starttag(self, tag, attrs): attrs = dict(attrs) - if self.intail or (self.at_line_start() and self.md.is_block_level(tag) and not self.inraw): + if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): # Started a new raw block. Prepare stack. self.inraw = True self.cleandoc.append('\n') diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index a4e7513d6..0a2092d3f 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -289,6 +289,17 @@ def test_raw_one_line_followed_by_text(self): ) ) + def test_raw_one_line_followed_by_span(self): + self.assertMarkdownRenders( + "

*foo*

*bar*", + self.dedent( + """ +

*foo*

+

bar

+ """ + ) + ) + def test_raw_with_markdown_blocks(self): self.assertMarkdownRenders( self.dedent( @@ -345,7 +356,7 @@ def test_adjacent_raw_blocks_with_blank_lines(self): self.dedent( """

A raw paragraph.

- +

A second raw paragraph.

""" ) From 5776e97c7b1ad01e4fe7d354516882fa33db14e4 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 15 Jul 2020 14:54:42 -0400 Subject: [PATCH 41/67] Update md_in_html with recent htmlparser changes. --- markdown/extensions/md_in_html.py | 55 ++++++++++++++----------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index fd0384d4a..2c9f17215 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -18,7 +18,7 @@ from ..blockprocessors import BlockProcessor from ..preprocessors import Preprocessor from .. import util -from ..htmlparser import HTMLExtractor +from ..htmlparser import HTMLExtractor, blank_line_re from html import parser import re import xml.etree.ElementTree as etree @@ -33,15 +33,11 @@ def reset(self): def handle_starttag(self, tag, attrs): attrs = dict(attrs) - self.stack.append(tag) - if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw: + if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): if not attrs.get('markdown', None) == '1': - # Started a new raw block + # Started a new raw block. Prepare stack. self.inraw = True - self.container_index = len(self.stack) - 1 - if len(self.cleandoc): - # Insert blank line between this and previous line. self.cleandoc.append('\n') if not self.inraw and 'markdown' in attrs: @@ -56,36 +52,35 @@ def handle_starttag(self, tag, attrs): else: text = self.get_starttag_text() if self.inraw: + self.stack.append(tag) self._cache.append(text) else: self.cleandoc.append(text) def handle_endtag(self, tag): - # Attempt to extract actual tag from raw source text - start = self.line_offset + self.offset - m = parser.endendtag.search(self.rawdata, start) - if m: - text = self.rawdata[start:m.end()] - else: - # Failed to extract from raw data. Assume well formed and lowercase. 
- text = ''.format(tag) + text = self.get_endtag_text(tag) - if tag in self.stack: - while self.stack: - if self.stack.pop() == tag: - break - if self.inraw and len(self.stack) <= self.container_index: - # End of raw block - self.inraw = False - self.stack = [] # Reset stack as it could have extranious items in it. - self.container_index = -1 - self._cache.append(text) - self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) - # Insert blank line between this and next line. TODO: make this conditional?? - self.cleandoc.append('\n\n') - self._cache = [] - elif self.inraw: + if self.inraw: self._cache.append(text) + if tag in self.stack: + # Remove tag from stack + while self.stack: + if self.stack.pop() == tag: + break + if len(self.stack) == 0: + # End of raw block. + if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]): + # Preserve blank line and end of raw block. + self._cache.append('\n') + else: + # More content exists after endtag. + self.intail = True + # Reset stack. + self.inraw = False + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + # Insert blank line between this and next line. 
+ self.cleandoc.append('\n\n') + self._cache = [] elif tag in self.mdstack: # Handle closing tag of markdown=1 element while self.mdstack: From 488846475c7a3d3c48431aff885af68df8c8d2f3 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 22 Jul 2020 10:22:54 -0400 Subject: [PATCH 42/67] Add test_md_in_html.py --- .../test_syntax/extensions/test_md_in_html.py | 448 ++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 tests/test_syntax/extensions/test_md_in_html.py diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py new file mode 100644 index 000000000..1d0da5224 --- /dev/null +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -0,0 +1,448 @@ +# -*- coding: utf-8 -*- +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + +from markdown.test_tools import TestCase + + +class TestMdInHTML(TestCase): + + default_kwargs = {'extensions': ['md_in_html']} + + def test_md1_paragraph(self): + self.assertMarkdownRenders( + '

*foo*

', + '

foo

' + ) + + def test_md1_p_linebreaks(self): + self.assertMarkdownRenders( + self.dedent( + """ +

+ *foo* +

+ """ + ), + self.dedent( + """ +

+ foo +

+ """ + ) + ) + + def test_md1_p_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ +

+ + *foo* + +

+ """ + ), + self.dedent( + """ +

+ + foo + +

+ """ + ) + ) + + def test_md1_div(self): + self.assertMarkdownRenders( + '
*foo*
', + self.dedent( + """ +
+

foo

+
+ """ + ) + ) + + def test_md1_div_linebreaks(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ *foo* +
+ """ + ), + self.dedent( + """ +
+

foo

+
+ """ + ) + ) + + def test_md1_div_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + *foo* + +
+ """ + ), + self.dedent( + """ +
+

foo

+
+ """ + ) + ) + + def test_md1_div_multi(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + *foo* + + __bar__ + +
+ """ + ), + self.dedent( + """ +
+

foo

+

bar

+
+ """ + ) + ) + + def test_md1_div_nested(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+ *foo* +
+ +
+ """ + ), + self.dedent( + """ +
+
+

foo

+
+
+ """ + ) + ) + + def test_md1_div_multi_nest(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+

*foo*

+
+ +
+ """ + ), + self.dedent( + """ +
+
+

foo

+
+
+ """ + ) + ) + + def test_md1_mix(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ A _Markdown_ paragraph before a raw child. + +

A *raw* child.

+ + A _Markdown_ tail to the raw child. +
+ """ + ), + self.dedent( + """ +
+

A Markdown paragraph before a raw child.

+

A raw child.

+

A Markdown tail to the raw child.

+
+ """ + ) + ) + + def test_md1_deep_mix(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + A _Markdown_ paragraph before a raw child. + + A second Markdown paragraph. + +
+ + A *raw* child. + +

*foo*

+ + Raw child tail. + +
+ + A _Markdown_ tail to the raw child. + + A second tail item + +

More raw.

+ +
+ """ + ), + self.dedent( + """ +
+

A Markdown paragraph before a raw child.

+

A second Markdown paragraph.

+
+

A raw child.

+

foo

+

Raw child tail.

+
+

A Markdown tail to the raw child.

+

A second tail item

+

More raw.

+
+ """ + ) + ) + + def test_md1_div_raw_inline(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + foo + +
+ """ + ), + self.dedent( + """ +
+

foo

+
+ """ + ) + ) + + def test_no_md1_paragraph(self): + self.assertMarkdownRenders( + '

*foo*

', + '

*foo*

' + ) + + def test_no_md1_nest(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ A _Markdown_ paragraph before a raw child. + +

A *raw* child.

+ + A _Markdown_ tail to the raw child. +
+ """ + ), + self.dedent( + """ +
+

A Markdown paragraph before a raw child.

+

A *raw* child.

+

A Markdown tail to the raw child.

+
+ """ + ) + ) + + def test_md_span_paragraph(self): + self.assertMarkdownRenders( + '

*foo*

', + '

foo

' + ) + + def test_md_block_paragraph(self): + self.assertMarkdownRenders( + '

*foo*

', + self.dedent( + """ +

+

foo

+

+ """ + ) + ) + + def test_md_span_div(self): + self.assertMarkdownRenders( + '
*foo*
', + '
foo
' + ) + + def test_md_block_div(self): + self.assertMarkdownRenders( + '
*foo*
', + self.dedent( + """ +
+

foo

+
+ """ + ) + ) + + def test_md_span_nested_in_block(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
*foo*
+
+ """ + ), + self.dedent( + """ +
+
foo
+
+ """ + ) + ) + + def test_md_block_nested_in_span(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
*foo*
+
+ """ + ), + self.dedent( + """ +
+
foo
+
+ """ + ) + ) + + def test_md1_nested_in_nomd(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
*foo*
+
+ """ + ), + self.dedent( + """ +
+
*foo*
+
+ """ + ) + ) + + def test_md1_single_quotes(self): + self.assertMarkdownRenders( + "

*foo*

", + '

foo

' + ) + + def test_md1_no_quotes(self): + self.assertMarkdownRenders( + '

*foo*

', + '

foo

' + ) + + def test_md_no_value(self): + self.assertMarkdownRenders( + '

*foo*

', + '

foo

' + ) + + def test_md1_preserve_attrs(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ +
+

*foo*

+
+ +
+ """ + ), + self.dedent( + """ +
+
+

foo

+
+
+ """ + ) + ) From aae6676419dcc4c7ba4952607ac4b8793cc207da Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 27 Jul 2020 08:57:33 -0400 Subject: [PATCH 43/67] More tests --- .../test_syntax/extensions/test_md_in_html.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 1d0da5224..f7e242909 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -446,3 +446,96 @@ def test_md1_preserve_attrs(self): """ ) ) + + def test_md1_unclosed_div(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + _foo_ + +
+

foo

+
+ __bar__ +
+
+ """ + ) + ) + + def test_md1_orphan_endtag(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ + _foo_ + +

+ + _bar_ + +
+ """ + ), + self.dedent( + """ +
+

foo

+

+

bar

+
+ """ + ) + ) + + def test_md1_unclosed_p(self): + self.assertMarkdownRenders( + self.dedent( + """ +

_foo_ +

_bar_ + """ + ), + self.dedent( + """ +

foo +

+

bar +

+ """ + ) + ) + + def test_md1_nested_unclosed_p(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+

_foo_ +

_bar_ +

+ """ + ), + self.dedent( + """ +
+

foo +

+

bar +

+
+ """ + ) + ) From 183537fbd17b357a375979717934d1b09e2cf2f1 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 28 Jul 2020 16:23:33 -0400 Subject: [PATCH 44/67] Handle markdown=1 attrs. --- markdown/extensions/md_in_html.py | 314 +++++++++++++++++++----------- 1 file changed, 200 insertions(+), 114 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 2c9f17215..c6bfea7c6 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -24,74 +24,128 @@ import xml.etree.ElementTree as etree +# Block-level tags in which the content only gets span level parsing +span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th'] + +# Block-level tags in which the content gets parsed as blocks +block_tags = [ + 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset', + 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map', + 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul' +] + +# Block-level tags which never get their content parsed. +raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'] + +block_level_tags = span_tags + block_tags + raw_tags + + class HTMLExtractorExtra(HTMLExtractor): + """ + Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown. + """ def reset(self): """Reset this instance. Loses all unprocessed data.""" self.mdstack = [] # When markdown=1, stack contains a list of tags + self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) + self.mdstate = None # one of 'block', 'span', 'off', or None super().reset() + def get_element(self): + """ Return element from treebuilder and reset treebuilder for later use. 
""" + element = self.treebuilder.close() + self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) + return element + + def get_state(self, tag, attrs, parent_state=None): + """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """ + md_attr = attrs.get('markdown', '0') + if md_attr == 'markdown': + # `` is the same as ``. + md_attr = '1' + if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'): + # Only use the parent state if it is more restrictive than the markdown attribute. + md_attr = parent_state + if ((md_attr == '1' and tag in block_tags) or + (md_attr == 'block' and tag in span_tags + block_tags)): + return 'block' + elif ((md_attr == '1' and tag in span_tags) or + (md_attr == 'span' and tag in span_tags + block_tags)): + return 'span' + elif tag in block_level_tags: + return 'off' + else: + return None + def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - - if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): - if not attrs.get('markdown', None) == '1': - # Started a new raw block. Prepare stack. - self.inraw = True - self.cleandoc.append('\n') - - if not self.inraw and 'markdown' in attrs: - self.mdstack.append(tag) - # Remove markdown attribute and rebuild start tag. - attrs.pop('markdown') - attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else '' - text = '<{}{}>'.format(tag, attrs_str) - self.cleandoc.append(self.md.htmlStash.store(text)) - if tag != 'p': - self.cleandoc.append('\n\n') + if tag in block_level_tags: + # Valueless attr (ex: ``) results in `[('checked', None)]`. Convert to `{'checked': 'checked'}`. 
+ attrs = {key: value if value is not None else key for key, value in attrs} + state = self.get_state(tag, attrs, self.mdstate) + + if self.inraw or (state in [None, 'off'] and not self.mdstack): + # fall back to default behavior + attrs.pop('markdown', None) + super().handle_starttag(tag, attrs) + else: + if 'p' in self.mdstack and tag in block_level_tags: + # Close unclosed 'p' tag + self.handle_endtag('p') + self.mdstate = state + self.mdstack.append(tag) + attrs['markdown'] = state + self.treebuilder.start(tag, attrs) else: - text = self.get_starttag_text() + # Span level tag if self.inraw: - self.stack.append(tag) - self._cache.append(text) + super().handle_starttag(tag, attrs) else: - self.cleandoc.append(text) + text = self.get_starttag_text() + self.handle_data(text) def handle_endtag(self, tag): - text = self.get_endtag_text(tag) - - if self.inraw: - self._cache.append(text) - if tag in self.stack: - # Remove tag from stack - while self.stack: - if self.stack.pop() == tag: + if tag in block_level_tags: + if self.inraw: + super().handle_endtag(tag) + elif tag in self.mdstack: + # Close element and any unclosed children + while self.mdstack: + item = self.mdstack.pop() + self.treebuilder.end(item) + if item == tag: break - if len(self.stack) == 0: - # End of raw block. - if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]): - # Preserve blank line and end of raw block. - self._cache.append('\n') - else: - # More content exists after endtag. - self.intail = True - # Reset stack. - self.inraw = False - self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) - # Insert blank line between this and next line. 
- self.cleandoc.append('\n\n') - self._cache = [] - elif tag in self.mdstack: - # Handle closing tag of markdown=1 element - while self.mdstack: - if self.mdstack.pop() == tag: - break - if tag != 'p': - self.cleandoc.append('\n\n') - self.cleandoc.append(self.md.htmlStash.store(text)) - self.cleandoc.append('\n\n') + if not self.mdstack: + # Last item in stack is closed. Stash it + element = self.get_element() + self.cleandoc.append(self.md.htmlStash.store(element)) + self.cleandoc.append('\n\n') + self.state = None + else: + # Treat orphan closing tag as an empty tag. + self.handle_startendtag(tag, {}) else: - self.cleandoc.append(text) + # Span level tag + if self.inraw: + super().handle_endtag(tag) + else: + text = self.get_endtag_text(tag) + self.handle_data(text) + + def handle_data(self, data): + if self.inraw or not self.mdstack: + super().handle_data(data) + else: + self.treebuilder.data(data) + + def handle_empty_tag(self, data, is_block): + if self.inraw or not self.mdstack: + super().handle_empty_tag(data, is_block) + else: + if self.at_line_start() and is_block: + self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') + else: + self.handle_date(text) class HtmlBlockPreprocessor(Preprocessor): @@ -106,62 +160,96 @@ def run(self, lines): class MarkdownInHtmlProcessor(BlockProcessor): - """Process Markdown Inside HTML Blocks.""" + """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash.""" + def test(self, parent, block): - return block == util.TAG_PLACEHOLDER % \ - str(self.parser.blockprocessors.tag_counter + 1) - - def _process_nests(self, element, block): - """Process the element's child elements in self.run.""" - # Build list of indexes of each nest within the parent element. 
- nest_index = [] # a list of tuples: (left index, right index) - i = self.parser.blockprocessors.tag_counter + 1 - while len(self._tag_data) > i and self._tag_data[i]['left_index']: - left_child_index = self._tag_data[i]['left_index'] - right_child_index = self._tag_data[i]['right_index'] - nest_index.append((left_child_index - 1, right_child_index)) - i += 1 - - # Create each nest subelement. - for i, (left_index, right_index) in enumerate(nest_index[:-1]): - self.run(element, block[left_index:right_index], - block[right_index:nest_index[i + 1][0]], True) - self.run(element, block[nest_index[-1][0]:nest_index[-1][1]], # last - block[nest_index[-1][1]:], True) # nest - - def run(self, parent, blocks, tail=None, nest=False): - self._tag_data = self.parser.md.htmlStash.tag_data - - self.parser.blockprocessors.tag_counter += 1 - tag = self._tag_data[self.parser.blockprocessors.tag_counter] - - # Create Element - markdown_value = tag['attrs'].pop('markdown') - element = etree.SubElement(parent, tag['tag'], tag['attrs']) - - # Slice Off Block - if nest: - self.parser.parseBlocks(parent, tail) # Process Tail - block = blocks[1:] - else: # includes nests since a third level of nesting isn't supported - block = blocks[tag['left_index'] + 1: tag['right_index']] - del blocks[:tag['right_index']] - - # Process Text - if (self.parser.blockprocessors.contain_span_tags.match( # Span Mode - tag['tag']) and markdown_value != 'block') or \ - markdown_value == 'span': - element.text = '\n'.join(block) - else: # Block Mode - i = self.parser.blockprocessors.tag_counter + 1 - if len(self._tag_data) > i and self._tag_data[i]['left_index']: - first_subelement_index = self._tag_data[i]['left_index'] - 1 - self.parser.parseBlocks( - element, block[:first_subelement_index]) - if not nest: - block = self._process_nests(element, block) - else: - self.parser.parseBlocks(element, block) + # ALways return True. `run` will return `False` it not a valid match. 
+ return True + + def parse_element_content(self, element): + """ + Resursively parse the text content of an etree Element as Markdown. + + Any block level elements generated from the Markdown will be inserted as children of the element in place + of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has + been dissabled, the text content of it and its chidlren are wrapped in an `AtomicString`. + """ + + md_attr = element.attrib.pop('markdown', 'off') + + if md_attr == 'block': + # Parse content as block level + # The order in which the different parts are parsed (text, children, tails) is important here as the + # order of elements needs to be preserved. We can't be inserting items at a later point in the current + # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for + # example). Therefore, the order of operations is children, tails, text. + + # Recursively parse existing children from raw HTML + for child in list(element): + self.parse_element_content(child) + + # Parse Markdown text in tail of children. Do this seperate to avoid raw HTML parsing. + # Save the position of each item to be inserted later in reverse. + tails = [] + for pos, child in enumerate(element): + if child.tail: + block = child.tail + child.tail = '' + # Use a dummy placeholder element. + dummy = etree.Element('div') + self.parser.parseBlocks(dummy, block.split('\n')) + children = list(dummy) + children.reverse() + tails.append((pos + 1, children)) + + # Insert the elements created from the tails in reverse. + tails.reverse() + for pos, tail in tails: + for item in tail: + element.insert(pos, item) + + # Parse Markdown text content. Do this last to avoid raw HTML parsing. + if element.text: + block = element.text + element.text = '' + # Use a dummy placeholder element as the content needs to get inserted before existing children. 
+ dummy = etree.Element('div') + self.parser.parseBlocks(dummy, block.split('\n')) + children = list(dummy) + children.reverse() + for child in children: + element.insert(0, child) + + elif md_attr == 'span': + # Span level parsing will be handled by inlineprocessors. + # Walk children here to remove any `markdown` attributes. + for child in list(element): + self.parse_element_content(child) + + else: + # Disable inline parsing for everything else + element.text = util.AtomicString(element.text) + for child in list(element): + self.parse_element_content(child) + if child.tail: + child.tail = util.AtomicString(child.tail) + + + def run(self, parent, blocks): + m = util.HTML_PLACEHOLDER_RE.match(blocks[0]) + if m: + index = int(m.group(1)) + element = self.parser.md.htmlStash.rawHtmlBlocks[index] + if isinstance(element, etree.Element): + # We have a match. Process it. + blocks.pop(0) + self.parse_element_content(element) + parent.append(element) + # Cleanup stash. Replace element with empty string to avoid confusing postprocessor. + self.parser.md.htmlStash.rawHtmlBlocks.pop(index) + self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') + # No match found. 
+ return False class MarkdownInHtmlExtension(Extension): @@ -172,12 +260,10 @@ def extendMarkdown(self, md): # Replace raw HTML preprocessor md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) - # md.parser.blockprocessors.register( - # MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 - # ) - # md.parser.blockprocessors.tag_counter = -1 - # md.parser.blockprocessors.contain_span_tags = re.compile( - # r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE) + # Add blockprocessor which handles the placeholders for etree elements + md.parser.blockprocessors.register( + MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105 + ) def makeExtension(**kwargs): # pragma: no cover From 7783d48df705cc9a6ea8fa44d1019181a7c0cbba Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 1 Sep 2020 13:57:51 -0400 Subject: [PATCH 45/67] Fix some bugs. --- markdown/extensions/md_in_html.py | 20 ++++++++++++++++--- tests/extensions/github_flavored.html | 1 - tests/test_legacy.py | 1 + .../test_syntax/extensions/test_md_in_html.py | 9 ++++++--- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index c6bfea7c6..ac7ca2c0c 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -52,6 +52,17 @@ def reset(self): self.mdstate = None # one of 'block', 'span', 'off', or None super().reset() + def close(self): + """Handle any buffered data.""" + super().close() + # Handle any unclosed tags. + if self.mdstack: + # Close the outermost parent. handle_endtag will close all unclosed children. + self.handle_endtag(self.mdstack[0]) + if len(self._cache): + self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) + self._cache = [] + def get_element(self): """ Return element from treebuilder and reset treebuilder for later use. 
""" element = self.treebuilder.close() @@ -122,8 +133,9 @@ def handle_endtag(self, tag): self.cleandoc.append('\n\n') self.state = None else: - # Treat orphan closing tag as an empty tag. - self.handle_startendtag(tag, {}) + # Treat orphan closing tag as a span level tag. + text = self.get_endtag_text(tag) + self.handle_data(text) else: # Span level tag if self.inraw: @@ -241,13 +253,15 @@ def run(self, parent, blocks): index = int(m.group(1)) element = self.parser.md.htmlStash.rawHtmlBlocks[index] if isinstance(element, etree.Element): - # We have a match. Process it. + # We have a matched element. Process it. blocks.pop(0) self.parse_element_content(element) parent.append(element) # Cleanup stash. Replace element with empty string to avoid confusing postprocessor. self.parser.md.htmlStash.rawHtmlBlocks.pop(index) self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '') + # Comfirm the match to the blockparser. + return True # No match found. return False diff --git a/tests/extensions/github_flavored.html b/tests/extensions/github_flavored.html index b39165a2e..98dc82a81 100644 --- a/tests/extensions/github_flavored.html +++ b/tests/extensions/github_flavored.html @@ -30,7 +30,6 @@ + CONTEXT_DIFF_LINE_PATTERN, +``` -

Test support for foo+bar lexer names.

<title>{% block title %}{% endblock %}</title>
 <ul>
diff --git a/tests/test_legacy.py b/tests/test_legacy.py
index 94fdd13f1..9cc09e398 100644
--- a/tests/test_legacy.py
+++ b/tests/test_legacy.py
@@ -131,6 +131,7 @@ class TestPl2007(LegacyTestCase):
 class TestExtensions(LegacyTestCase):
     location = os.path.join(parent_test_dir, 'extensions')
     exclude = ['codehilite']
+    maxDiff = None
 
     attr_list = Kwargs(extensions=['attr_list', 'def_list', 'smarty'])
 
diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
index f7e242909..a45367705 100644
--- a/tests/test_syntax/extensions/test_md_in_html.py
+++ b/tests/test_syntax/extensions/test_md_in_html.py
@@ -455,7 +455,7 @@ def test_md1_unclosed_div(self):
 
                 _foo_
 
-                
_bar_ @@ -467,7 +467,9 @@ def test_md1_unclosed_div(self):

foo

- __bar__ + + _bar_ +
""" @@ -493,7 +495,7 @@ def test_md1_orphan_endtag(self): """

foo

-

+

bar

""" @@ -513,6 +515,7 @@ def test_md1_unclosed_p(self):

foo

bar +

""" ) From cae2ef044caca984efd34f37d573fc871659fce3 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 1 Sep 2020 16:20:18 -0400 Subject: [PATCH 46/67] track mdstate down and back up nested elements. --- markdown/extensions/md_in_html.py | 12 +++++----- tests/extensions/extra/raw-html.html | 11 +++++----- .../test_syntax/extensions/test_md_in_html.py | 22 +++++++++++++++++++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index ac7ca2c0c..5fc71aab9 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -49,7 +49,7 @@ def reset(self): """Reset this instance. Loses all unprocessed data.""" self.mdstack = [] # When markdown=1, stack contains a list of tags self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) - self.mdstate = None # one of 'block', 'span', 'off', or None + self.mdstate = [] # one of 'block', 'span', 'off', or None super().reset() def close(self): @@ -69,12 +69,13 @@ def get_element(self): self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) return element - def get_state(self, tag, attrs, parent_state=None): + def get_state(self, tag, attrs): """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """ md_attr = attrs.get('markdown', '0') if md_attr == 'markdown': # `` is the same as ``. md_attr = '1' + parent_state = self.mdstate[-1] if self.mdstate else None if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'): # Only use the parent state if it is more restrictive than the markdown attribute. md_attr = parent_state @@ -93,7 +94,7 @@ def handle_starttag(self, tag, attrs): if tag in block_level_tags: # Valueless attr (ex: ``) results in `[('checked', None)]`. Convert to `{'checked': 'checked'}`. 
attrs = {key: value if value is not None else key for key, value in attrs} - state = self.get_state(tag, attrs, self.mdstate) + state = self.get_state(tag, attrs) if self.inraw or (state in [None, 'off'] and not self.mdstack): # fall back to default behavior @@ -103,7 +104,7 @@ def handle_starttag(self, tag, attrs): if 'p' in self.mdstack and tag in block_level_tags: # Close unclosed 'p' tag self.handle_endtag('p') - self.mdstate = state + self.mdstate.append(state) self.mdstack.append(tag) attrs['markdown'] = state self.treebuilder.start(tag, attrs) @@ -123,6 +124,7 @@ def handle_endtag(self, tag): # Close element and any unclosed children while self.mdstack: item = self.mdstack.pop() + self.mdstate.pop() self.treebuilder.end(item) if item == tag: break @@ -131,7 +133,7 @@ def handle_endtag(self, tag): element = self.get_element() self.cleandoc.append(self.md.htmlStash.store(element)) self.cleandoc.append('\n\n') - self.state = None + self.state = [] else: # Treat orphan closing tag as a span level tag. text = self.get_endtag_text(tag) diff --git a/tests/extensions/extra/raw-html.html b/tests/extensions/extra/raw-html.html index ac367d77b..ef94cb303 100644 --- a/tests/extensions/extra/raw-html.html +++ b/tests/extensions/extra/raw-html.html @@ -14,11 +14,13 @@

The tail of the DefaultBlockMode subelement.

-This text is not wrapped in additional p tags.

+This text is not wrapped in additional p tags. +

The tail of the DefaultSpanMode subelement.

This div block is not wrapped in paragraph tags. -Note: Subelements are not required to have tail text.
+Note: Subelements are not required to have tail text. +

This p block is foolishly wrapped in further paragraph tags.

@@ -26,7 +28,6 @@
Raw html blocks may also be nested.
-

This text is after the markdown in html.

@@ -38,14 +39,12 @@
Raw html blocks may also be nested.
-

Markdown is still active here.

Markdown is active again here.

foo bar

-

bar -

+

bar

diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index a45367705..29341b9bb 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -387,6 +387,28 @@ def test_md_block_nested_in_span(self): ) ) + def test_md_block_after_span_nested_in_block(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+
*foo*
+
*bar*
+
+ """ + ), + self.dedent( + """ +
+
foo
+
+

bar

+
+
+ """ + ) + ) + def test_md1_nested_in_nomd(self): self.assertMarkdownRenders( self.dedent( From 56111c43cdd8233729cbc6a0b162f1ac9e88d15e Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 2 Sep 2020 10:00:01 -0400 Subject: [PATCH 47/67] fix nested multiline paragraphs. --- markdown/extensions/md_in_html.py | 8 ++++---- tests/test_syntax/extensions/test_md_in_html.py | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 5fc71aab9..6bf8e27a5 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -207,11 +207,11 @@ def parse_element_content(self, element): tails = [] for pos, child in enumerate(element): if child.tail: - block = child.tail + block = child.tail.rstrip('\n') child.tail = '' # Use a dummy placeholder element. dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n')) + self.parser.parseBlocks(dummy, block.split('\n\n')) children = list(dummy) children.reverse() tails.append((pos + 1, children)) @@ -224,11 +224,11 @@ def parse_element_content(self, element): # Parse Markdown text content. Do this last to avoid raw HTML parsing. if element.text: - block = element.text + block = element.text.rstrip('\n') element.text = '' # Use a dummy placeholder element as the content needs to get inserted before existing children. dummy = etree.Element('div') - self.parser.parseBlocks(dummy, block.split('\n')) + self.parser.parseBlocks(dummy, block.split('\n\n')) children = list(dummy) children.reverse() for child in children: diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 29341b9bb..0de5c10a0 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -226,7 +226,8 @@ def test_md1_deep_mix(self): A _Markdown_ paragraph before a raw child. - A second Markdown paragraph. 
+ A second Markdown paragraph + with two lines.
@@ -241,6 +242,7 @@ def test_md1_deep_mix(self): A _Markdown_ tail to the raw child. A second tail item + with two lines.

More raw.

@@ -251,14 +253,16 @@ def test_md1_deep_mix(self): """

A Markdown paragraph before a raw child.

-

A second Markdown paragraph.

+

A second Markdown paragraph + with two lines.

A raw child.

foo

Raw child tail.

A Markdown tail to the raw child.

-

A second tail item

+

A second tail item + with two lines.

More raw.

""" From dda275573dc2d541546a5231c8f95376c3d904d9 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 3 Sep 2020 16:10:15 -0400 Subject: [PATCH 48/67] Move link reference handling to block parser. --- markdown/blockprocessors.py | 30 ++++++++++++++++ markdown/preprocessors.py | 35 ------------------- .../test_syntax/extensions/test_md_in_html.py | 23 ++++++++++++ 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index e81f83c9a..742f17470 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -51,6 +51,7 @@ def build_block_parser(md, **kwargs): parser.blockprocessors.register(OListProcessor(parser), 'olist', 40) parser.blockprocessors.register(UListProcessor(parser), 'ulist', 30) parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 20) + parser.blockprocessors.register(ReferenceProcessor(parser), 'reference', 15) parser.blockprocessors.register(ParagraphProcessor(parser), 'paragraph', 10) return parser @@ -554,6 +555,35 @@ def run(self, parent, blocks): ) +class ReferenceProcessor(BlockProcessor): + """ Process link references. """ + RE = re.compile( + r'^[ ]{0,3}\[([^\]]*)\]:[ ]*\n?[ ]*([^\s]+)[ ]*\n?[ ]*((["\'])(.*)\4|\((.*)\))?[ ]*$', re.MULTILINE + ) + + def test(self, parent, block): + return True + + def run(self, parent, blocks): + block = blocks.pop(0) + m = self.RE.search(block) + if m: + id = m.group(1).strip().lower() + link = m.group(2).lstrip('<').rstrip('>') + title = m.group(5) or m.group(6) + self.parser.md.references[id] = (link, title) + if block[m.end():].strip(): + # Add any content after match back to blocks as separate block + blocks.insert(0, block[m.end():].lstrip('\n')) + if block[:m.start()].strip(): + # Add any content before match back to blocks as separate block + blocks.insert(0, block[:m.start()].rstrip('\n')) + return True + # No match. Restore block. 
+ blocks.insert(0, block) + return False + + class ParagraphProcessor(BlockProcessor): """ Process Paragraph blocks. """ diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index 76a25428a..e1023c59a 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -35,7 +35,6 @@ def build_preprocessors(md, **kwargs): preprocessors = util.Registry() preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30) preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) - preprocessors.register(ReferencePreprocessor(md), 'reference', 10) return preprocessors @@ -81,37 +80,3 @@ def run(self, lines): parser.feed(source) parser.close() return ''.join(parser.cleandoc).split('\n') - - -class ReferencePreprocessor(Preprocessor): - """ Remove reference definitions from text and store for later use. """ - - TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*' - RE = re.compile( - r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL - ) - TITLE_RE = re.compile(r'^%s$' % TITLE) - - def run(self, lines): - new_text = [] - while lines: - line = lines.pop(0) - m = self.RE.match(line) - if m: - id = m.group(1).strip().lower() - link = m.group(2).lstrip('<').rstrip('>') - t = m.group(5) or m.group(6) or m.group(7) - if not t: - # Check next line for title - tm = self.TITLE_RE.match(lines[0]) - if tm: - lines.pop(0) - t = tm.group(2) or tm.group(3) or tm.group(4) - self.md.references[id] = (link, t) - # Preserve the line to prevent raw HTML indexing issue. 
- # https://github.com/Python-Markdown/markdown/issues/584 - new_text.append('') - else: - new_text.append(line) - - return new_text # + "\n" diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 0de5c10a0..132caa42f 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -568,3 +568,26 @@ def test_md1_nested_unclosed_p(self): """ ) ) + + def test_md1_nested_link_ref(self): + self.assertMarkdownRenders( + self.dedent( + """ +
+ [link]: http://example.com +
+ [link][link] +
+
+ """ + ), + self.dedent( + """ +
+
+

link

+
+
+ """ + ) + ) From 370d601eb75d4c32ca7c26a7311ca9c1f3c81495 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 8 Sep 2020 10:12:24 -0400 Subject: [PATCH 49/67] Move abbr reference handling to block parser. --- markdown/extensions/abbr.py | 49 ++++++++++--------- .../test_syntax/extensions/test_md_in_html.py | 24 +++++++++ 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index b53f2c4e1..9879314f5 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -17,48 +17,53 @@ ''' from . import Extension -from ..preprocessors import Preprocessor +from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor from ..util import AtomicString import re import xml.etree.ElementTree as etree -# Global Vars -ABBR_REF_RE = re.compile(r'[*]\[(?P[^\]]*)\][ ]?:\s*(?P.*)') - class AbbrExtension(Extension): """ Abbreviation Extension for Python-Markdown. """ def extendMarkdown(self, md): """ Insert AbbrPreprocessor before ReferencePreprocessor. """ - md.preprocessors.register(AbbrPreprocessor(md), 'abbr', 12) + md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16) -class AbbrPreprocessor(Preprocessor): +class AbbrPreprocessor(BlockProcessor): """ Abbreviation Preprocessor - parse text for abbr references. """ - def run(self, lines): + RE = re.compile(r'^[*]\[(?P<abbr>[^\]]*)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE) + + def test(self, parent, block): + return True + + def run(self, parent, blocks): ''' Find and remove all Abbreviation references from the text. Each reference is set as a new AbbrPattern in the markdown instance. 
''' - new_text = [] - for line in lines: - m = ABBR_REF_RE.match(line) - if m: - abbr = m.group('abbr').strip() - title = m.group('title').strip() - self.md.inlinePatterns.register( - AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2 - ) - # Preserve the line to prevent raw HTML indexing issue. - # https://github.com/Python-Markdown/markdown/issues/584 - new_text.append('') - else: - new_text.append(line) - return new_text + block = blocks.pop(0) + m = self.RE.search(block) + if m: + abbr = m.group('abbr').strip() + title = m.group('title').strip() + self.parser.md.inlinePatterns.register( + AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2 + ) + if block[m.end():].strip(): + # Add any content after match back to blocks as separate block + blocks.insert(0, block[m.end():].lstrip('\n')) + if block[:m.start()].strip(): + # Add any content before match back to blocks as separate block + blocks.insert(0, block[:m.start()].rstrip('\n')) + return True + # No match. Restore block. 
+ blocks.insert(0, block) + return False def _generate_pattern(self, text): ''' diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 132caa42f..061ffb928 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -591,3 +591,27 @@ def test_md1_nested_link_ref(self): """ ) ) + + def test_md1_nested_abbr_ref(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + *[abbr]: Abbreviation + <div markdown="1"> + abbr + </div> + </div> + """ + ), + self.dedent( + """ + <div> + <div> + <p><abbr title="Abbreviation">abbr</abbr></p> + </div> + </div> + """ + ), + extensions=['md_in_html', 'abbr'] + ) From 81ac09dc0fee81aa4f3e25dba855d975b91e184a Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 8 Sep 2020 14:16:01 -0400 Subject: [PATCH 50/67] Move footnote reference handling to block parser. --- markdown/extensions/footnotes.py | 163 ++++++++---------- .../test_syntax/extensions/test_md_in_html.py | 32 ++++ 2 files changed, 106 insertions(+), 89 deletions(-) diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py index beab9196f..77366e784 100644 --- a/markdown/extensions/footnotes.py +++ b/markdown/extensions/footnotes.py @@ -15,6 +15,7 @@ from . 
import Extension from ..preprocessors import Preprocessor +from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor from ..treeprocessors import Treeprocessor from ..postprocessors import Postprocessor @@ -26,8 +27,6 @@ FN_BACKLINK_TEXT = util.STX + "zz1337820767766393qq" + util.ETX NBSP_PLACEHOLDER = util.STX + "qq3936677670287331zz" + util.ETX -DEF_RE = re.compile(r'[ ]{0,3}\[\^([^\]]*)\]:\s*(.*)') -TABBED_RE = re.compile(r'((\t)|( ))(.*)') RE_REF_ID = re.compile(r'(fnref)(\d+)') @@ -72,8 +71,8 @@ def extendMarkdown(self, md): md.registerExtension(self) self.parser = md.parser self.md = md - # Insert a preprocessor before ReferencePreprocessor - md.preprocessors.register(FootnotePreprocessor(self), 'footnote', 15) + # Insert a blockprocessor before ReferencePreprocessor + md.parser.blockprocessors.register(FootnoteBlockProcessor(self), 'footnote', 17) # Insert an inline pattern before ImageReferencePattern FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah @@ -202,106 +201,92 @@ def makeFootnotesDiv(self, root): return div -class FootnotePreprocessor(Preprocessor): +class FootnoteBlockProcessor(BlockProcessor): """ Find all footnote references and store for later use. """ + RE = re.compile(r'^[ ]{0,3}\[\^([^\]]*)\]:[ ]*(.*)$', re.MULTILINE) + def __init__(self, footnotes): + super().__init__(footnotes.parser) self.footnotes = footnotes - def run(self, lines): - """ - Loop through lines and find, set, and remove footnote definitions. - - Keywords: - - * lines: A list of lines of text - - Return: A list of lines of text with footnote definitions removed. - - """ - newlines = [] - i = 0 - while True: - m = DEF_RE.match(lines[i]) - if m: - fn, _i = self.detectTabbed(lines[i+1:]) - fn.insert(0, m.group(2)) - i += _i-1 # skip past footnote - footnote = "\n".join(fn) - self.footnotes.setFootnote(m.group(1), footnote.rstrip()) - # Preserve a line for each block to prevent raw HTML indexing issue. 
- # https://github.com/Python-Markdown/markdown/issues/584 - num_blocks = (len(footnote.split('\n\n')) * 2) - newlines.extend([''] * (num_blocks)) + def test(self, parent, block): + return True + + def run(self, parent, blocks): + """ Find, set, and remove footnote definitions. """ + block = blocks.pop(0) + m = self.RE.search(block) + if m: + id = m.group(1) + fn_blocks = [m.group(2)] + + # Handle rest of block + therest = block[m.end():].lstrip('\n') + m2 = self.RE.search(therest) + if m2: + # Another footnote exists in the rest of this block. + # Any content before match is continuation of this footnote, which may be lazily indented. + before = therest[:m2.start()].rstrip('\n') + fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(before)]).lstrip('\n') + # Add back to blocks everything from begining of match forward for next iteration. + blocks.insert(0, therest[m2.start():]) else: - newlines.append(lines[i]) - if len(lines) > i+1: - i += 1 - else: - break - return newlines + # All remaining lines of block are continuation of this footnote, which may be lazily indented. + fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(therest)]).strip('\n') - def detectTabbed(self, lines): - """ Find indented text and remove indent before further proccesing. + # Check for child elements in remaining blocks. + fn_blocks.extend(self.detectTabbed(blocks)) - Keyword arguments: + footnote = "\n\n".join(fn_blocks) + self.footnotes.setFootnote(id, footnote.rstrip()) - * lines: an array of strings + if block[:m.start()].strip(): + # Add any content before match back to blocks as separate block + blocks.insert(0, block[:m.start()].rstrip('\n')) + return True + # No match. Restore block. + blocks.insert(0, block) + return False - Returns: a list of post processed items and the index of last line. + def detectTabbed(self, blocks): + """ Find indented text and remove indent before further proccesing. + Returns: a list of blocks with indentation removed. 
""" - items = [] - blank_line = False # have we encountered a blank line yet? - i = 0 # to keep track of where we are - - def detab(line): - match = TABBED_RE.match(line) - if match: - return match.group(4) - - for line in lines: - if line.strip(): # Non-blank line - detabbed_line = detab(line) - if detabbed_line: - items.append(detabbed_line) - i += 1 - continue - elif not blank_line and not DEF_RE.match(line): - # not tabbed but still part of first par. - items.append(line) - i += 1 - continue - else: - return items, i+1 - - else: # Blank line: _maybe_ we are done. - blank_line = True - i += 1 # advance - - # Find the next non-blank line - for j in range(i, len(lines)): - if lines[j].strip(): - next_line = lines[j] - break - else: - # Include extreaneous padding to prevent raw HTML - # parsing issue: https://github.com/Python-Markdown/markdown/issues/584 - items.append("") - i += 1 + fn_blocks = [] + while blocks: + if blocks[0].startswith(' '*4): + block = blocks.pop(0) + # Check for new footnotes within this block and split at new footnote. + m = self.RE.search(block) + if m: + # Another footnote exists in this block. + # Any content before match is continuation of this footnote, which may be lazily indented. + before = block[:m.start()].rstrip('\n') + fn_blocks.append(self.detab(before)) + # Add back to blocks everything from begining of match forward for next iteration. + blocks.insert(0, block[m.start():]) + # End of this footnote. + break else: - break # There is no more text; we are done. + # Entire block is part of this footnote. + fn_blocks.append(self.detab(block)) + else: + # End of this footnote. + break + return fn_blocks - # Check if the next non-blank line is tabbed - if detab(next_line): # Yes, more work to do. - items.append("") - continue - else: - break # No, we are done. - else: - i += 1 + def detab(self, block): + """ Remove one level of indent from a block. 
- return items, i + Preserve lazily indented blocks by only removing indent from indented lines. + """ + lines = block.split('\n') + for i, line in enumerate(lines): + if line.startswith(' '*4): + lines[i] = line[4:] + return '\n'.join(lines) class FootnoteInlineProcessor(InlineProcessor): diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 061ffb928..0cb6a5b8e 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -615,3 +615,35 @@ def test_md1_nested_abbr_ref(self): ), extensions=['md_in_html', 'abbr'] ) + + def test_md1_nested_footnote_ref(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + [^1]: The footnote. + <div markdown="1"> + Paragraph with a footnote.[^1] + </div> + </div> + """ + ), + self.dedent( + """ + <div> + <div> + <p>Paragraph with a footnote.<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p> + </div> + </div> + <div class="footnote"> + <hr /> + <ol> + <li id="fn:1"> + <p>The footnote. <a class="footnote-backref" href="#fnref:1" title="Jump back to footnote 1 in the text">↩</a></p> + </li> + </ol> + </div> + """ + ), + extensions=['md_in_html', 'footnotes'] + ) From 6b068e3a58ee098e92a1e907dfdad0b59d1b1a26 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 8 Sep 2020 14:32:15 -0400 Subject: [PATCH 51/67] Cleanup --- markdown/extensions/footnotes.py | 1 - markdown/extensions/md_in_html.py | 10 +++--- tests/test_legacy.py | 1 - .../test_syntax/extensions/test_md_in_html.py | 32 +++++++++---------- 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py index 77366e784..f6f4c8577 100644 --- a/markdown/extensions/footnotes.py +++ b/markdown/extensions/footnotes.py @@ -14,7 +14,6 @@ """ from . 
import Extension -from ..preprocessors import Preprocessor from ..blockprocessors import BlockProcessor from ..inlinepatterns import InlineProcessor from ..treeprocessors import Treeprocessor diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 6bf8e27a5..24b7463e9 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -18,9 +18,7 @@ from ..blockprocessors import BlockProcessor from ..preprocessors import Preprocessor from .. import util -from ..htmlparser import HTMLExtractor, blank_line_re -from html import parser -import re +from ..htmlparser import HTMLExtractor import xml.etree.ElementTree as etree @@ -92,7 +90,8 @@ def get_state(self, tag, attrs): def handle_starttag(self, tag, attrs): if tag in block_level_tags: - # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`. Convert to `{'checked': 'checked'}`. + # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`. + # Convert to `{'checked': 'checked'}`. 
attrs = {key: value if value is not None else key for key, value in attrs} state = self.get_state(tag, attrs) @@ -159,7 +158,7 @@ def handle_empty_tag(self, data, is_block): if self.at_line_start() and is_block: self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') else: - self.handle_date(text) + self.handle_date(data) class HtmlBlockPreprocessor(Preprocessor): @@ -248,7 +247,6 @@ def parse_element_content(self, element): if child.tail: child.tail = util.AtomicString(child.tail) - def run(self, parent, blocks): m = util.HTML_PLACEHOLDER_RE.match(blocks[0]) if m: diff --git a/tests/test_legacy.py b/tests/test_legacy.py index 9cc09e398..94fdd13f1 100644 --- a/tests/test_legacy.py +++ b/tests/test_legacy.py @@ -131,7 +131,6 @@ class TestPl2007(LegacyTestCase): class TestExtensions(LegacyTestCase): location = os.path.join(parent_test_dir, 'extensions') exclude = ['codehilite'] - maxDiff = None attr_list = Kwargs(extensions=['attr_list', 'def_list', 'smarty']) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 0cb6a5b8e..6a9b3d432 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -628,22 +628,20 @@ def test_md1_nested_footnote_ref(self): </div> """ ), - self.dedent( - """ - <div> - <div> - <p>Paragraph with a footnote.<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p> - </div> - </div> - <div class="footnote"> - <hr /> - <ol> - <li id="fn:1"> - <p>The footnote. <a class="footnote-backref" href="#fnref:1" title="Jump back to footnote 1 in the text">↩</a></p> - </li> - </ol> - </div> - """ - ), + '<div>\n' + '<div>\n' + '<p>Paragraph with a footnote.<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '</div>\n' + '</div>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>The footnote. 
' + '<a class="footnote-backref" href="#fnref:1" title="Jump back to footnote 1 in the text">↩</a>' + '</p>\n' + '</li>\n' + '</ol>\n' + '</div>', extensions=['md_in_html', 'footnotes'] ) From 7a853978442521f8dcc55d71ca173897e96a9cbc Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 8 Sep 2020 15:31:54 -0400 Subject: [PATCH 52/67] Remove reference to comments and PIs in TreeBuilder as unused. --- markdown/extensions/md_in_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 24b7463e9..3cd8ed55e 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -46,7 +46,7 @@ class HTMLExtractorExtra(HTMLExtractor): def reset(self): """Reset this instance. Loses all unprocessed data.""" self.mdstack = [] # When markdown=1, stack contains a list of tags - self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) + self.treebuilder = etree.TreeBuilder() self.mdstate = [] # one of 'block', 'span', 'off', or None super().reset() From 42299a84b6545d18523a8d43e93799606e92eab4 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 8 Sep 2020 15:36:24 -0400 Subject: [PATCH 53/67] Remove other reference to comments and PIs in TreeBuilder. --- markdown/extensions/md_in_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 3cd8ed55e..2d155e0b8 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -64,7 +64,7 @@ def close(self): def get_element(self): """ Return element from treebuilder and reset treebuilder for later use. 
""" element = self.treebuilder.close() - self.treebuilder = etree.TreeBuilder(insert_comments=True, insert_pis=True) + self.treebuilder = etree.TreeBuilder() return element def get_state(self, tag, attrs): From fbae48445dde9dfe409b30501f2277cbe469f654 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 9 Sep 2020 14:54:49 -0400 Subject: [PATCH 54/67] Rewrite extension docs. --- docs/extensions/md_in_html.md | 252 ++++++++++++++++++++++++---------- 1 file changed, 177 insertions(+), 75 deletions(-) diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md index b57197b9f..1f30d55af 100644 --- a/docs/extensions/md_in_html.md +++ b/docs/extensions/md_in_html.md @@ -6,120 +6,222 @@ title: Markdown in HTML Extension An extensions that parses Markdown inside of HTML tags. -## Usage +## Syntax -From the Python interpreter: +By default, Markdown ignores any content within a raw HTML block-level element. With the `md-in-html` extension +enabled, the content of a raw HTML block-level element can be parsed as Markdown by including a `markdown` attribute +on the opening tag. The `markdown` attribute will be stripped from the output, while all other attributes will be +preserved. -```pycon ->>> import markdown ->>> html = markdown.markdown(text, extensions=['md_in_html']) -``` +The `markdown` attribute can be assigned one of three values: [`"1"`](#1), [`"block"`](#block), or [`"span"`](#span). -Unlike the other Extra features, this feature is built into the markdown core and -is turned on when `markdown.extensions.extra` or `markdown.extensions.md_in_html` -is enabled. +!!! note -The content of any raw HTML block element can be Markdown-formatted simply by -adding a `markdown` attribute to the opening tag. The markdown attribute will be -stripped from the output, but all other attributes will be preserved. 
+ The expressions "block-level" and "span-level" as used in this document refer to an element's designation + according to the HTML specification. Whereas the `"span"` and `"block"` values assigned to the `markdown` + attribute refer to the Markdown parser's behavior. -If the markdown value is set to `1` (recommended) or any value other than `span` -or `block`, the default behavior will be executed: `p`,`h[1-6]`,`li`,`dd`,`dt`, -`td`,`th`,`legend`, and `address` elements skip block parsing while others do not. -If the default is overridden by a value of `span`, *block parsing will be skipped* -regardless of tag. If the default is overridden by a value of `block`, -*block parsing will occur* regardless of tag. +### `markdown="1"` { #1 } -#### Simple Example: +When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag. -```md -This is *true* markdown text. +The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`, +`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`, +`main`, `menu`, `nav`, `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and +`ul`. +For example, the following: + +``` <div markdown="1"> -This is *true* markdown text. +This is a *Markdown* Paragraph. </div> ``` -#### Result: +... is rendered as: -```html -<p>This is <em>true</em> markdown text.</p> +``` html <div> -<p>This is <em>true</em> markdown text.</p> +<p>This is a <em>Markdown</em> Paragraph.</p> </div> ``` -### Nested Markdown Inside HTML Blocks +The following tags have the `span` behavior by default: `address`, `dd`, `dt`, `h[1-6]`, `legend`, `li`, `p`, `td`, +and `th`. -Nested elements are more sensitive and must be used cautiously. To avoid -unexpected results: +For example, the following: -* Only nest elements within block mode elements. 
-* Follow the closing tag of inner elements with a blank line. -* Only have one level of nesting. +``` +<p markdown="1"> +This is not a *Markdown* Paragraph. +</p> +``` -#### Complex Example: +... is rendered as: -```md -<div markdown="1" name="Example"> +``` html +<p> +This is not a <em>Markdown</em> Paragraph. +</p> +``` -The text of the `Example` element. +### `markdown="block"` { #block } -<div markdown="1" name="DefaultBlockMode"> -This text gets wrapped in `p` tags. -</div> +When the `markdown` attribute is set to `"block"`, then the parser will force the `block` behavior on the contents of +the element so long as it is one of the `block` or `span` tags. -The tail of the `DefaultBlockMode` subelement. +The content of a `block` element is parsed into block-level content. In other words, the text is rendered as +paragraphs, headers, lists, blockquotes, etc. Any inline syntax within those elements is processed as well. -<p markdown="1" name="DefaultSpanMode"> -This text *is not* wrapped in additional `p` tags. -</p> +For example, the following: -The tail of the `DefaultSpanMode` subelement. +``` +<section markdown="block"> +# A header. -<div markdown="span" name="SpanModeOverride"> -This `div` block is not wrapped in paragraph tags. -Note: Subelements are not required to have tail text. -</div> +A *Markdown* paragraph. -<p markdown="block" name="BlockModeOverride"> -This `p` block *is* foolishly wrapped in further paragraph tags. -</p> +* A list item. +* A second list item. + +</section> +``` + +... is rendered as: + +``` html +<section> +<h1>A header.</h1> +<p>A <em>Markdown</em> paragraph.</p> +<ul> +<li>A list item.</li> +<li>A second list item.</li> +</ul> +</section> +``` + +!!! warning + + Forcing elements to be parsed as `block` elements when they are not by default could result in invalid HTML. + For example, one could force a `<p>` element to be nested within another `<p>` element. 
In most cases it is + recommended to use the default behavior of `markdown="1"`. Explicitly setting `markdown="block"` should be + reserved for advanced users who understand the HTML specification and how browsers parse and render HTML. + +### `markdown="span"` { #span } + +When the `markdown` attribute is set to `"span"`, then the parser will force the `span` behavior on the contents +of the element so long as it is one of the `block` or `span` tags. -The tail of the `BlockModeOverride` subelement. +The content of a `span` element is not parsed into block-level content. In other words, the content will not be +rendered as paragraphs, headers, etc. Only inline syntax will be rendered, such as links, strong, emphasis, etc. -<div name="RawHtml"> -Raw HTML blocks may also be nested. +For example, the following: + +``` +<div markdown="span"> +# *Not* a header </div> +``` + +... is rendered as: +``` html +<div> +# <em>Not</em> a header </div> +``` + +### Ignored Elements + +The following tags are always ignored, regardless of any `markdown` attribute: `canvas`, `math`, `option`, `pre`, +`script`, `style`, and `textarea`. All other raw HTML tags are treated as span-level tags and are not affected by this +extension. + +### Nesting + +When nesting multiple levels of raw HTML elements, a `markdown` attribute must be defined for each block-level +element. For any block-level element which does not have a `markdown` attribute, everything inside that element is +ignored, including child elements with `markdown` attributes. + +For example, the following: -This text is after the markdown in HTML. ``` +<article id="my-article" markdown="1"> +# Article Title + +A Markdown paragraph. + +<section id="section-1" markdown="1"> +## Section 1 Title + +<p>Custom raw **HTML** which gets ignored.</p> + +</section> + +<section id="section-2" markdown="1"> +## Section 2 Title + +<p markdown="1">**Markdown** content.</p> -#### Complex Result: +</section> + +</article> +``` + +... 
is rendered as: ```html -<div name="Example"> -<p>The text of the <code>Example</code> element.</p> -<div name="DefaultBlockMode"> -<p>This text gets wrapped in <code>p</code> tags.</p> -</div> -<p>The tail of the <code>DefaultBlockMode</code> subelement.</p> -<p name="DefaultSpanMode"> -This text <em>is not</em> wrapped in additional <code>p</code> tags.</p> -<p>The tail of the <code>DefaultSpanMode</code> subelement.</p> -<div name="SpanModeOverride"> -This <code>div</code> block is not wrapped in paragraph tags. -Note: Subelements are not required to have tail text.</div> -<p name="BlockModeOverride"> -<p>This <code>p</code> block <em>is</em> foolishly wrapped in further paragraph tags.</p> -</p> -<p>The tail of the <code>BlockModeOverride</code> subelement.</p> -<div name="RawHtml"> -Raw HTML blocks may also be nested. +<article id="my-article"> +<h1>Article Title</h1> +<p>A Markdown paragraph.</p> +<section id="section-1"> +<h2>Section 1 Title</h2> +<p>Custom raw **HTML** which gets ignored.</p> +</section> +<section id="section-2"> +<h2>Section 2 Title</h2> +<p><strong>Markdown</strong> content.</p> +</section> +</article> +``` + +When the value of an element's `markdown` attribute is more permissive that its parent, then the parent's stricter +behavior is enforced. For example, a `block` element nested within a `span` element will be parsed using the `span` +behavior. However, if the value of an element's `markdown` attribute is the same as, or more restrictive than, its +parent, the the child element's behavior is observed. For example, a `block` element may contain either `block` +elements or `span` elements as children and each element will be parsed using the specified behavior. + +### Normalization + +While the default behavior is for Markdown to not alter raw HTML, any block-level elements with the `markdown` element +defined are normalized. 
For example, the following raw HTML: + +``` +<div markdown="1"> +<p markdown="1">A Markdown paragraph with *no* closing tag. +<p>A raw paragraph with *no* closing tag. </div> +``` + +... is rendered as: +``` html +<div> +<p>A Markdown paragraph with <em>no</em> closing tag.</p> +<p>A raw paragraph with *no* closing tag. </div> -<p>This text is after the markdown in HTML.</p> +``` + +Notice that the parser properly recognized that an unclosed `<p>` tag ends when another `<p>` tag begins or when the +parent element ends. While the first `<p>` element, which included a `markdown` attribute, had the closing tag added +to the rendered output, the second `<p>` did not get the closing tag added as it did not have a `markdown` attribute. +Of course, a browser will properly interpret both tags. + +## Usage + +From the Python interpreter: + +``` pycon +>>> import markdown +>>> html = markdown.markdown(text, extensions=['md_in_html']) ``` From 097f52c98a3d8fdbaf1f87c50d92b26cfc4c2831 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 9 Sep 2020 15:36:52 -0400 Subject: [PATCH 55/67] Fix normalization docs to match behavior. --- docs/extensions/md_in_html.md | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md index 1f30d55af..ba4424bcb 100644 --- a/docs/extensions/md_in_html.md +++ b/docs/extensions/md_in_html.md @@ -4,7 +4,7 @@ title: Markdown in HTML Extension ## Summary -An extensions that parses Markdown inside of HTML tags. +An extension that parses Markdown inside of HTML tags. ## Syntax @@ -191,10 +191,9 @@ behavior. However, if the value of an element's `markdown` attribute is the same parent, the the child element's behavior is observed. For example, a `block` element may contain either `block` elements or `span` elements as children and each element will be parsed using the specified behavior. 
-### Normalization +### Tag Normalization -While the default behavior is for Markdown to not alter raw HTML, any block-level elements with the `markdown` element -defined are normalized. For example, the following raw HTML: +While the default behavior is for Markdown to not alter raw HTML, as this extension is parsing the content of raw HTML elements, it will do some normalization of the tags of block-level elements. For example, the following raw HTML: ``` <div markdown="1"> @@ -207,15 +206,26 @@ defined are normalized. For example, the following raw HTML: ``` html <div> -<p>A Markdown paragraph with <em>no</em> closing tag.</p> +<p>A Markdown paragraph with <em>no</em> closing tag. +</p> <p>A raw paragraph with *no* closing tag. +</p> </div> ``` -Notice that the parser properly recognized that an unclosed `<p>` tag ends when another `<p>` tag begins or when the -parent element ends. While the first `<p>` element, which included a `markdown` attribute, had the closing tag added -to the rendered output, the second `<p>` did not get the closing tag added as it did not have a `markdown` attribute. -Of course, a browser will properly interpret both tags. +Notice that the parser properly recognizes that an unclosed `<p>` tag ends when another `<p>` tag begins or when the +parent element ends. In both cases, a closing `</p>` was added to the end of the element, regardless of whether a +`markdown` attribute was assigned to the element. + +To avoid any normalization, an element must not be a descendant of any block-level element which has a `markdown` +attribute defined. + +!!! warning + + The normalization behavior is only documented here so that document authors are not surprised when their carefully + crafted raw HTML is altered by Markdown. This extension should not be relied on to normalize and generate valid + HTML. For the best results, always include valid raw HTML (with both opening and closing tags) in your Markdown + documents. 
## Usage From df14000630eae07f059d37ce75d24938496db9b9 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 9 Sep 2020 15:40:21 -0400 Subject: [PATCH 56/67] Update spelling dict with unclosed --- .spell-dict | 1 + 1 file changed, 1 insertion(+) diff --git a/.spell-dict b/.spell-dict index eed0f67b5..fbe4865a7 100644 --- a/.spell-dict +++ b/.spell-dict @@ -131,6 +131,7 @@ Treeprocessor Treeprocessors tuple tuples +unclosed unescape unescaping unittest From f61eb284429376f4f49617ce0d870ad6fc764cf3 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Fri, 11 Sep 2020 16:28:28 -0400 Subject: [PATCH 57/67] Address some coverage. --- markdown/extensions/md_in_html.py | 7 +- markdown/htmlparser.py | 2 +- .../test_syntax/extensions/test_md_in_html.py | 78 +++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 2d155e0b8..9d7e839fa 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -57,9 +57,6 @@ def close(self): if self.mdstack: # Close the outermost parent. handle_endtag will close all unclosed children. self.handle_endtag(self.mdstack[0]) - if len(self._cache): - self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) - self._cache = [] def get_element(self): """ Return element from treebuilder and reset treebuilder for later use. 
""" @@ -85,7 +82,7 @@ def get_state(self, tag, attrs): return 'span' elif tag in block_level_tags: return 'off' - else: + else: #pragma: no cover return None def handle_starttag(self, tag, attrs): @@ -158,7 +155,7 @@ def handle_empty_tag(self, data, is_block): if self.at_line_start() and is_block: self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n') else: - self.handle_date(data) + self.handle_data(data) class HtmlBlockPreprocessor(Preprocessor): diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index a8f792580..1a86e9345 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -108,7 +108,7 @@ def get_endtag_text(self, tag): m = htmlparser.endendtag.search(self.rawdata, start) if m: return self.rawdata[start:m.end()] - else: + else: # pragma: no cover # Failed to extract from raw data. Assume well formed and lowercase. return '</{}>'.format(tag) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 6a9b3d432..014fdb714 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -319,6 +319,54 @@ def test_no_md1_nest(self): ) ) + def test_md1_nested_empty(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + A _Markdown_ paragraph before a raw empty tag. + + <img src="image.png" alt="An image" /> + + A _Markdown_ tail to the raw empty tag. + </div> + """ + ), + self.dedent( + """ + <div> + <p>A <em>Markdown</em> paragraph before a raw empty tag.</p> + <p><img src="image.png" alt="An image" /></p> + <p>A <em>Markdown</em> tail to the raw empty tag.</p> + </div> + """ + ) + ) + + def test_md1_nested_empty_block(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + A _Markdown_ paragraph before a raw empty tag. + + <hr /> + + A _Markdown_ tail to the raw empty tag. 
+ </div> + """ + ), + self.dedent( + """ + <div> + <p>A <em>Markdown</em> paragraph before a raw empty tag.</p> + <hr /> + <p>A <em>Markdown</em> tail to the raw empty tag.</p> + </div> + """ + ) + ) + def test_md_span_paragraph(self): self.assertMarkdownRenders( '<p markdown="span">*foo*</p>', @@ -413,6 +461,36 @@ def test_md_block_after_span_nested_in_block(self): ) ) + def test_nomd_nested_in_md1(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + *foo* + <div> + *foo* + <p>*bar*</p> + *baz* + </div> + *bar* + </div> + """ + ), + self.dedent( + """ + <div> + <p><em>foo</em></p> + <div> + *foo* + <p>*bar*</p> + *baz* + </div> + <p><em>bar</em></p> + </div> + """ + ) + ) + def test_md1_nested_in_nomd(self): self.assertMarkdownRenders( self.dedent( From 2d8ce544766abf532896d6c9f7700ac4feb4d70d Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 09:03:32 -0400 Subject: [PATCH 58/67] Ensure extension doesn't break default behavior. --- tests/test_syntax/extensions/test_md_in_html.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 014fdb714..d29a4437b 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -21,6 +21,13 @@ """ from markdown.test_tools import TestCase +from ..blocks.test_html_blocks import TestHTMLBlocks + + +class TestDefaultwMdInHTML(TestHTMLBlocks): + """ Ensure the md_in_html extension does not break the default behavior. 
""" + + default_kwargs = {'extensions': ['md_in_html']} class TestMdInHTML(TestCase): From 4856e86622ee7ad08ade437c5398689f6965b095 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 10:04:36 -0400 Subject: [PATCH 59/67] update abbr tests --- tests/extensions/extra/abbr.html | 4 - tests/extensions/extra/abbr.txt | 13 -- tests/test_syntax/extensions/test_abbr.py | 242 ++++++++++++++++++++++ 3 files changed, 242 insertions(+), 17 deletions(-) delete mode 100644 tests/extensions/extra/abbr.html delete mode 100644 tests/extensions/extra/abbr.txt create mode 100644 tests/test_syntax/extensions/test_abbr.py diff --git a/tests/extensions/extra/abbr.html b/tests/extensions/extra/abbr.html deleted file mode 100644 index 456524e86..000000000 --- a/tests/extensions/extra/abbr.html +++ /dev/null @@ -1,4 +0,0 @@ -<p>An <abbr title="Abbreviation">ABBR</abbr>: "<abbr title="Reference">REF</abbr>". -ref and REFERENCE should be ignored.</p> -<p>The <abbr title="Hyper Text Markup Language">HTML</abbr> specification -is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p> \ No newline at end of file diff --git a/tests/extensions/extra/abbr.txt b/tests/extensions/extra/abbr.txt deleted file mode 100644 index 991bf1561..000000000 --- a/tests/extensions/extra/abbr.txt +++ /dev/null @@ -1,13 +0,0 @@ -An ABBR: "REF". -ref and REFERENCE should be ignored. - -*[REF]: Reference -*[ABBR]: This gets overriden by the next one. -*[ABBR]: Abbreviation - -The HTML specification -is maintained by the W3C. - -*[HTML]: Hyper Text Markup Language -*[W3C]: World Wide Web Consortium - diff --git a/tests/test_syntax/extensions/test_abbr.py b/tests/test_syntax/extensions/test_abbr.py new file mode 100644 index 000000000..64388c2d8 --- /dev/null +++ b/tests/test_syntax/extensions/test_abbr.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. 
+ +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + +from markdown.test_tools import TestCase + + +class TestAbbr(TestCase): + + default_kwargs = {'extensions': ['abbr']} + + def test_abbr_upper(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]: Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_lower(self): + self.assertMarkdownRenders( + self.dedent( + """ + abbr + + *[abbr]: Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">abbr</abbr></p> + """ + ) + ) + + def test_abbr_multiple(self): + self.assertMarkdownRenders( + self.dedent( + """ + The HTML specification + is maintained by the W3C. 
+ + *[HTML]: Hyper Text Markup Language + *[W3C]: World Wide Web Consortium + """ + ), + self.dedent( + """ + <p>The <abbr title="Hyper Text Markup Language">HTML</abbr> specification + is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p> + """ + ) + ) + + def test_abbr_override(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]: Ignored + *[ABBR]: The override + """ + ), + self.dedent( + """ + <p><abbr title="The override">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_no_blank_Lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + *[ABBR]: Abbreviation + ABBR + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr></p> + <p><abbr title="Abbreviation">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_no_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]:Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_extra_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR] : Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_line_break(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]: + Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_ignore_unmatched_case(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR abbr + + *[ABBR]: Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr> abbr</p> + """ + ) + ) + + def test_abbr_partial_word(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR ABBREVIATION + + *[ABBR]: Abbreviation + """ + ), + self.dedent( + """ + <p><abbr title="Abbreviation">ABBR</abbr> ABBREVIATION</p> + """ + ) + ) + + def test_abbr_unused(self): + self.assertMarkdownRenders( + self.dedent( + """ + foo bar + + 
*[ABBR]: Abbreviation + """ + ), + self.dedent( + """ + <p>foo bar</p> + """ + ) + ) + + def test_abbr_double_quoted(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]: "Abbreviation" + """ + ), + self.dedent( + """ + <p><abbr title=""Abbreviation"">ABBR</abbr></p> + """ + ) + ) + + def test_abbr_single_quoted(self): + self.assertMarkdownRenders( + self.dedent( + """ + ABBR + + *[ABBR]: 'Abbreviation' + """ + ), + self.dedent( + """ + <p><abbr title="'Abbreviation'">ABBR</abbr></p> + """ + ) + ) From 07c9267606b795586e23b978b90a3447b84e3c6e Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 11:03:09 -0400 Subject: [PATCH 60/67] add basic link ref tests. --- tests/test_syntax/inline/test_links.py | 184 ++++++++++++++++++++++++- 1 file changed, 183 insertions(+), 1 deletion(-) diff --git a/tests/test_syntax/inline/test_links.py b/tests/test_syntax/inline/test_links.py index be4237db0..d650223fa 100644 --- a/tests/test_syntax/inline/test_links.py +++ b/tests/test_syntax/inline/test_links.py @@ -22,7 +22,7 @@ from markdown.test_tools import TestCase -class TestAdvancedLinks(TestCase): +class TestInlineLinks(TestCase): def test_nested_square_brackets(self): self.assertMarkdownRenders( @@ -134,6 +134,188 @@ def test_amp_in_url(self): '<p><a href="http://example.com/?a=1&b=2">title</a></p>' ) + +class TestReferenceLinks(TestCase): + + def test_ref_link(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com + """ + ), + """<p><a href="http://example.com">Text</a></p>""" + ) + + def test_ref_link_angle_brackets(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: <http://example.com> + """ + ), + """<p><a href="http://example.com">Text</a></p>""" + ) + + def test_ref_link_no_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]:http://example.com + """ + ), + """<p><a 
href="http://example.com">Text</a></p>""" + ) + + def test_ref_link_angle_brackets_no_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]:<http://example.com> + """ + ), + """<p><a href="http://example.com">Text</a></p>""" + ) + + def test_ref_link_angle_brackets_title(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: <http://example.com> "title" + """ + ), + """<p><a href="http://example.com" title="title">Text</a></p>""" + ) + + + def test_ref_link_title(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com "title" + """ + ), + """<p><a href="http://example.com" title="title">Text</a></p>""" + ) + + def test_ref_link_angle_brackets_title_no_space(self): + # TODO: Maybe reevaluate this? + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: <http://example.com>"title" + """ + ), + """<p><a href="http://example.com>"title"">Text</a></p>""" + ) + + + def test_ref_link_title_no_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com"title" + """ + ), + """<p><a href="http://example.com"title"">Text</a></p>""" + ) + + def test_ref_link_single_quoted_title(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com 'title' + """ + ), + """<p><a href="http://example.com" title="title">Text</a></p>""" + ) + + def test_ref_link_title_nested_quote(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com "title'" + """ + ), + """<p><a href="http://example.com" title="title'">Text</a></p>""" + ) + + def test_ref_link_single_quoted_title_nested_quote(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: http://example.com 'title"' + """ + ), + """<p><a href="http://example.com" title="title"">Text</a></p>""" + ) + + def test_ref_link_override(self): + self.assertMarkdownRenders( + self.dedent( + """ + 
[Text] + + [Text]: http://example.com 'ignore' + [Text]: https://example.com 'override' + """ + ), + """<p><a href="https://example.com" title="override">Text</a></p>""" + ) + + def test_ref_link_title_no_blank_lines(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + [Text]: http://example.com "title" + [Text] + """ + ), + self.dedent( + """ + <p><a href="http://example.com" title="title">Text</a></p> + <p><a href="http://example.com" title="title">Text</a></p> + """ + ) + ) + + def test_ref_link_multi_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + [Text] + + [Text]: + http://example.com + "title" + """ + ), + """<p><a href="http://example.com" title="title">Text</a></p>""" + ) + def test_reference_newlines(self): """Test reference id whitespace cleanup.""" From 82b97e5d378723d71824b1f5fb4f1d9c6dd9e276 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 11:10:04 -0400 Subject: [PATCH 61/67] flake8 cleanup --- markdown/extensions/md_in_html.py | 2 +- tests/test_syntax/inline/test_links.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 9d7e839fa..3518d059f 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -82,7 +82,7 @@ def get_state(self, tag, attrs): return 'span' elif tag in block_level_tags: return 'off' - else: #pragma: no cover + else: # pragma: no cover return None def handle_starttag(self, tag, attrs): diff --git a/tests/test_syntax/inline/test_links.py b/tests/test_syntax/inline/test_links.py index d650223fa..7a3e1c322 100644 --- a/tests/test_syntax/inline/test_links.py +++ b/tests/test_syntax/inline/test_links.py @@ -197,7 +197,6 @@ def test_ref_link_angle_brackets_title(self): """<p><a href="http://example.com" title="title">Text</a></p>""" ) - def test_ref_link_title(self): self.assertMarkdownRenders( self.dedent( @@ -223,7 +222,6 @@ def 
test_ref_link_angle_brackets_title_no_space(self): """<p><a href="http://example.com>"title"">Text</a></p>""" ) - def test_ref_link_title_no_space(self): self.assertMarkdownRenders( self.dedent( From 1a0a89377f97734ca3e886897167be16ab161381 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 11:45:49 -0400 Subject: [PATCH 62/67] footnote tests. 100% patch coverage --- .../test_syntax/extensions/test_footnotes.py | 243 +++++++++++++++++- 1 file changed, 241 insertions(+), 2 deletions(-) diff --git a/tests/test_syntax/extensions/test_footnotes.py b/tests/test_syntax/extensions/test_footnotes.py index 7785a2b2a..1a3a2b0bb 100644 --- a/tests/test_syntax/extensions/test_footnotes.py +++ b/tests/test_syntax/extensions/test_footnotes.py @@ -24,6 +24,247 @@ class TestFootnotes(TestCase): + default_kwargs = {'extensions': ['footnotes']} + maxDiff = None + + def test_basic_footnote(self): + self.assertMarkdownRenders( + self.dedent( + """ + paragraph[^1] + + [^1]: A Footnote + """ + ), + '<p>paragraph<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote <a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_multiple_footnotes(self): + self.assertMarkdownRenders( + self.dedent( + """ + foo[^1] + + bar[^2] + + [^1]: Footnote 1 + [^2]: Footnote 2 + """ + ), + '<p>foo<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<p>bar<sup id="fnref:2"><a class="footnote-ref" href="#fn:2">2</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>Footnote 1 <a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '<li id="fn:2">\n' + '<p>Footnote 2 <a class="footnote-backref" href="#fnref:2"' + ' title="Jump 
back to footnote 2 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_multiple_footnotes_multiline(self): + self.assertMarkdownRenders( + self.dedent( + """ + foo[^1] + + bar[^2] + + [^1]: Footnote 1 + line 2 + [^2]: Footnote 2 + """ + ), + '<p>foo<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<p>bar<sup id="fnref:2"><a class="footnote-ref" href="#fn:2">2</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>Footnote 1\nline 2 <a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '<li id="fn:2">\n' + '<p>Footnote 2 <a class="footnote-backref" href="#fnref:2"' + ' title="Jump back to footnote 2 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_footnote_multi_line(self): + self.assertMarkdownRenders( + self.dedent( + """ + paragraph[^1] + [^1]: A Footnote + line 2 + """ + ), + '<p>paragraph<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote\nline 2 <a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_footnote_multi_line_lazy_indent(self): + self.assertMarkdownRenders( + self.dedent( + """ + paragraph[^1] + [^1]: A Footnote + line 2 + """ + ), + '<p>paragraph<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote\nline 2 <a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_footnote_multi_line_complex(self): + self.assertMarkdownRenders( + self.dedent( + """ + paragraph[^1] + + [^1]: + + A Footnote + line 2 + + * list item + + > blockquote + """ + ), + 
'<p>paragraph<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote\nline 2</p>\n' + '<ul>\n<li>list item</li>\n</ul>\n' + '<blockquote>\n<p>blockquote</p>\n</blockquote>\n' + '<p><a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_footnote_multple_complex(self): + self.assertMarkdownRenders( + self.dedent( + """ + foo[^1] + + bar[^2] + + [^1]: + + A Footnote + line 2 + + * list item + + > blockquote + + [^2]: Second footnote + + paragraph 2 + """ + ), + '<p>foo<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<p>bar<sup id="fnref:2"><a class="footnote-ref" href="#fn:2">2</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote\nline 2</p>\n' + '<ul>\n<li>list item</li>\n</ul>\n' + '<blockquote>\n<p>blockquote</p>\n</blockquote>\n' + '<p><a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '<li id="fn:2">\n' + '<p>Second footnote</p>\n' + '<p>paragraph 2 <a class="footnote-backref" href="#fnref:2"' + ' title="Jump back to footnote 2 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + + def test_footnote_multple_complex_no_blank_line_between(self): + self.assertMarkdownRenders( + self.dedent( + """ + foo[^1] + + bar[^2] + + [^1]: + + A Footnote + line 2 + + * list item + + > blockquote + [^2]: Second footnote + + paragraph 2 + """ + ), + '<p>foo<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></p>\n' + '<p>bar<sup id="fnref:2"><a class="footnote-ref" href="#fn:2">2</a></sup></p>\n' + '<div class="footnote">\n' + '<hr />\n' + '<ol>\n' + '<li id="fn:1">\n' + '<p>A Footnote\nline 2</p>\n' + '<ul>\n<li>list item</li>\n</ul>\n' + '<blockquote>\n<p>blockquote</p>\n</blockquote>\n' + 
'<p><a class="footnote-backref" href="#fnref:1"' + ' title="Jump back to footnote 1 in the text">↩</a></p>\n' + '</li>\n' + '<li id="fn:2">\n' + '<p>Second footnote</p>\n' + '<p>paragraph 2 <a class="footnote-backref" href="#fnref:2"' + ' title="Jump back to footnote 2 in the text">↩</a></p>\n' + '</li>\n' + '</ol>\n' + '</div>' + ) + def test_backlink_text(self): """Test backlink configuration.""" @@ -39,7 +280,6 @@ def test_backlink_text(self): '</li>\n' '</ol>\n' '</div>', - extensions=['footnotes'], extension_configs={'footnotes': {'BACKLINK_TEXT': 'back'}} ) @@ -58,6 +298,5 @@ def test_footnote_separator(self): '</li>\n' '</ol>\n' '</div>', - extensions=['footnotes'], extension_configs={'footnotes': {'SEPARATOR': '-'}} ) From 46ac4363440f69cf13e890e3315d34c267ade4a1 Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 13:32:40 -0400 Subject: [PATCH 63/67] Add test for case in #1012. --- .../test_syntax/extensions/test_md_in_html.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index d29a4437b..7b980f6e3 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -654,6 +654,28 @@ def test_md1_nested_unclosed_p(self): ) ) + def test_md1_nested_comment(self): + self.assertMarkdownRenders( + self.dedent( + """ + <div markdown="1"> + A *Markdown* paragraph. + <!-- foobar --> + A *Markdown* paragraph. + </div> + """ + ), + self.dedent( + """ + <div> + <p>A <em>Markdown</em> paragraph.</p> + <!-- foobar --> + <p>A <em>Markdown</em> paragraph.</p> + </div> + """ + ) + ) + def test_md1_nested_link_ref(self): self.assertMarkdownRenders( self.dedent( From 9cfbf204478d9598a0166feed252c1be9d49ce7d Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 13:51:58 -0400 Subject: [PATCH 64/67] Add release notes. 
--- docs/change_log/release-3.3.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/change_log/release-3.3.md b/docs/change_log/release-3.3.md index dfbb384e3..cf2f1dd5d 100644 --- a/docs/change_log/release-3.3.md +++ b/docs/change_log/release-3.3.md @@ -66,6 +66,21 @@ The following new features have been included in the 3.3 release: Any random HTML attribute can be defined and set on the `<code>` tag of fenced code blocks when the `attr_list` extension is enabled (#816). +* The HTML parser has been completely replaced. The new HTML parser is built on Python's + [html.parser.HTMLParser](https://docs.python.org/3/library/html.parser.html), which + alleviates various bugs and simplifies maintenance of the code (#803, #830). + +* The [Markdown in HTML](../md_in_html.md) extension has been rebuilt on the new HTML + Parser, which drastically simplifies it. Note that raw HTML elements with a `markdown` + attribute defined are now converted to ElementTree Elements and are rendered by the + serializer. Various bugs have been fixed (#803, #595, #780, and #1012). + +* Link reference parsing, abbreviation reference parsing and footnote reference parsing + have all been moved from `preprocessors` to `blockprocessors`, which allows them to be + nested within other block level elements. Specifically, this change was necessary to + maintain the current behavior in the rebuilt Markdown in HTML extension. A few random + edge-case bugs (see the included tests) were resolved in the process (#803). + ## Bug fixes The following bug fixes are included in the 3.3 release: From 1eb9fd3523575a066d9f7406c522890bfb3d371f Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 14:07:18 -0400 Subject: [PATCH 65/67] Avoid duplicate tests.
--- tests/test_syntax/extensions/test_md_in_html.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py index 7b980f6e3..b68412c8e 100644 --- a/tests/test_syntax/extensions/test_md_in_html.py +++ b/tests/test_syntax/extensions/test_md_in_html.py @@ -20,6 +20,7 @@ License: BSD (see LICENSE.md for details). """ +from unittest import TestSuite from markdown.test_tools import TestCase from ..blocks.test_html_blocks import TestHTMLBlocks @@ -752,3 +753,12 @@ def test_md1_nested_footnote_ref(self): '</div>', extensions=['md_in_html', 'footnotes'] ) + + +def load_tests(loader, tests, pattern): + ''' Ensure TestHTMLBlocks doesn't get run twice by excluding it here. ''' + suite = TestSuite() + for test_class in [TestDefaultwMdInHTML, TestMdInHTML]: + tests = loader.loadTestsFromTestCase(test_class) + suite.addTests(tests) + return suite From 6f3b417266df12812ff7227aeb172db36ad14eaa Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Tue, 15 Sep 2020 14:14:50 -0400 Subject: [PATCH 66/67] Fix a broken link --- docs/change_log/release-3.3.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/change_log/release-3.3.md b/docs/change_log/release-3.3.md index cf2f1dd5d..4417c9a6c 100644 --- a/docs/change_log/release-3.3.md +++ b/docs/change_log/release-3.3.md @@ -70,10 +70,10 @@ The following new features have been included in the 3.3 release: [html.parser.HTMLParser](https://docs.python.org/3/library/html.parser.html), which alleviates various bugs and simplifies maintenance of the code (#803, #830). -* The [Markdown in HTML](../md_in_html.md) extension has been rebuilt on the new HTML - Parser, which drastically simplifies it. Note that raw HTML elements with a `markdown` - attribute defined are now converted to ElementTree Elements and are rendered by the - serializer.
Various bugs have been fixed (#803, #595, #780, and #1012). +* The [Markdown in HTML](../extensions/md_in_html.md) extension has been rebuilt on the + new HTML Parser, which drastically simplifies it. Note that raw HTML elements with a + `markdown` attribute defined are now converted to ElementTree Elements and are rendered + by the serializer. Various bugs have been fixed (#803, #595, #780, and #1012). * Link reference parsing, abbreviation reference parsing and footnote reference parsing have all been moved from `preprocessors` to `blockprocessors`, which allows them to be From 15b431aefdaaf20457cf1e6c7b7bcea07c809e7f Mon Sep 17 00:00:00 2001 From: Waylan Limberg <waylan.limberg@icloud.com> Date: Wed, 16 Sep 2020 14:32:05 -0400 Subject: [PATCH 67/67] Final cleanup. --- markdown/htmlparser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 1a86e9345..f83ddeace 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -113,8 +113,6 @@ def get_endtag_text(self, tag): return '</{}>'.format(tag) def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)): # Started a new raw block. Prepare stack. self.inraw = True