Correct escape text

lepture · May 7, 2022 · 8452faf · 8452faf
1 parent 8e3fe09
commit 8452faf
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 36 deletions.
diff --git a/mistune/__init__.py b/mistune/__init__.py
@@ -3,7 +3,7 @@
 from .inline_parser import InlineParser, InlineState
 from .renderers import HTMLRenderer
 # from .plugins import PLUGINS
-from .util import escape, escape_url, escape_html, unikey
+from .util import escape, escape_url, safe_entity, unikey
 
 
 def create_markdown(escape=True, hard_wrap=False, renderer=None, plugins=None):
@@ -69,7 +69,7 @@ def markdown(text, escape=True, renderer=None, plugins=None):
     'Markdown', 'HTMLRenderer',
     'BlockParser', 'BlockState',
     'InlineParser', 'InlineState',
-    'escape', 'escape_url', 'escape_html', 'unikey',
+    'escape', 'escape_url', 'safe_entity', 'unikey',
     'html', 'create_markdown', 'markdown',
 ]
 

diff --git a/mistune/block_parser.py b/mistune/block_parser.py
@@ -1,7 +1,9 @@
 import re
 from .util import (
     unikey,
+    escape,
     escape_url,
+    safe_entity,
     ESCAPE_CHAR_RE,
     LINK_LABEL,
     LINK_BRACKET_RE,
@@ -179,7 +181,7 @@ def parse_indent_code(self, line, cursor, state):
 
         code = expand_leading_tab(code)
         code = _INDENT_CODE_TRIM.sub('', code)
-        code = code.strip('\n')
+        code = escape(code.strip('\n'))
         state.add_token({'type': 'block_code', 'raw': code}, start_line, cursor)
         return cursor + 1
 
@@ -215,9 +217,9 @@ def parse_fenced_code(self, line, cursor, state):
             _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
             code = _trim_pattern.sub('', code)
 
-        token = {'type': 'block_code', 'raw': code}
+        token = {'type': 'block_code', 'raw': escape(code)}
         if info:
-            token['attrs'] = {'info': info.strip()}
+            token['attrs'] = {'info': safe_entity(info.strip())}
 
         state.add_token(token, start_line, cursor)
         return cursor + 1
@@ -288,7 +290,7 @@ def parse_def_link(self, line, cursor, state):
             attrs = {'url': escape_url(url)}
             if title:
                 title = ESCAPE_CHAR_RE.sub(r'\1', title)
-                attrs['title'] = title
+                attrs['title'] = safe_entity(title)
             state.def_links[key] = attrs
         return cursor + 1
 

diff --git a/mistune/inline_parser.py b/mistune/inline_parser.py
@@ -10,6 +10,7 @@
 
     escape,
     escape_url,
+    safe_entity,
     unikey,
 )
 
@@ -118,7 +119,7 @@ def parse_escape(self, m, state):
         text = ESCAPE_CHAR_RE.sub(r'\1', text)
         state.tokens.append({
             'type': 'text',
-            'raw': text,
+            'raw': safe_entity(text),
         })
         return m.end()
 
@@ -245,7 +246,7 @@ def _parse_std_link(self, m, token_type, text, state):
                         'children': self.render_text(text, new_state),
                         'attrs': {
                             'url': escape_url(url),
-                            'title': title,
+                            'title': safe_entity(title),
                         },
                     })
                     return m3.end()
@@ -302,7 +303,7 @@ def parse_auto_email(self, m, state):
         return pos
 
     def _parse_auto_link(self, url, text, state):
-        children = self.render_tokens([{'type': 'text', 'raw': escape(text)}])
+        children = self.render_tokens([{'type': 'text', 'raw': safe_entity(text)}])
         state.tokens.append({
             'type': 'link',
             'children': children,
@@ -336,7 +337,7 @@ def parse_emphasis(self, m, state):
             return self.record_text(pos, marker, state)
 
         if hole:
-            state.tokens.append({'type': 'text', 'raw': hole})
+            state.tokens.append({'type': 'text', 'raw': safe_entity(hole)})
 
         new_state = state.copy()
         text = m1.group(1)
@@ -378,7 +379,7 @@ def parse_codespan(self, m, state):
             if len(code.strip()):
                 if code.startswith(' ') and code.endswith(' '):
                     code = code[1:-1]
-            state.tokens.append({'type': 'codespan', 'raw': code})
+            state.tokens.append({'type': 'codespan', 'raw': escape(code)})
             return m.end()
         return self.record_text(pos, marker, state)
 
@@ -406,7 +407,7 @@ def parse(self, s, pos, state):
 
             end_pos = m.start()
             if end_pos > pos:
-                hole = s[pos:end_pos]
+                hole = safe_entity(s[pos:end_pos])
                 state.tokens.append({'type': 'text', 'raw': hole})
 
             token_type = m.lastgroup
@@ -415,20 +416,20 @@ def parse(self, s, pos, state):
             if not new_pos:
                 # move cursor 1 character forward
                 pos = end_pos + 1
-                hole = s[end_pos:pos]
+                hole = safe_entity(s[end_pos:pos])
                 state.tokens.append({'type': 'text', 'raw': hole})
             else:
                 pos = new_pos
 
         if pos == 0:
             # special case, just pure text
-            state.tokens.append({'type': 'text', 'raw': s})
+            state.tokens.append({'type': 'text', 'raw': safe_entity(s)})
         elif pos < len(s):
-            state.tokens.append({'type': 'text', 'raw': s[pos:]})
+            state.tokens.append({'type': 'text', 'raw': safe_entity(s[pos:])})
         return state.tokens
 
     def record_text(self, pos, text, state):
-        state.tokens.append({'type': 'text', 'raw': text})
+        state.tokens.append({'type': 'text', 'raw': safe_entity(text)})
         return pos
 
     def render_text(self, s: str, state: InlineState):

diff --git a/mistune/renderers.py b/mistune/renderers.py
@@ -1,4 +1,4 @@
-from .util import escape, escape_html
+from .util import escape
 
 
 class BaseRenderer(object):
@@ -78,20 +78,19 @@ def _safe_url(self, url):
     def text(self, text):
         if self._escape:
             return escape(text)
-        return escape_html(text)
+        return text
 
     def link(self, text, url, title=None):
         s = '<a href="' + self._safe_url(url) + '"'
         if title:
-            s += ' title="' + escape_html(title) + '"'
+            s += ' title="' + title + '"'
         return s + '>' + text + '</a>'
 
     def image(self, text, url, title=None):
         src = self._safe_url(url)
-        alt = escape_html(text)
-        s = '<img src="' + src + '" alt="' + alt + '"'
+        s = '<img src="' + src + '" alt="' + escape(text) + '"'
         if title:
-            s += ' title="' + escape_html(title) + '"'
+            s += ' title="' + title + '"'
         return s + ' />'
 
     def emphasis(self, text):
@@ -101,7 +100,7 @@ def strong(self, text):
         return '<strong>' + text + '</strong>'
 
     def codespan(self, text):
-        return '<code>' + escape(text) + '</code>'
+        return '<code>' + text + '</code>'
 
     def linebreak(self):
         return '<br />\n'
@@ -136,9 +135,8 @@ def block_code(self, code, info=None):
             info = info.strip()
         if info:
             lang = info.split(None, 1)[0]
-            lang = escape_html(lang)
             html += ' class="language-' + lang + '"'
-        return html + '>' + escape(code) + '</code></pre>\n'
+        return html + '>' + code + '</code></pre>\n'
 
     def block_quote(self, text):
         return '<blockquote>' + text + '</blockquote>\n'

diff --git a/mistune/util.py b/mistune/util.py
@@ -1,10 +1,13 @@
 import re
 try:
     from urllib.parse import quote
-    import html
 except ImportError:
     from urllib import quote
-    html = None
+
+try:
+    from html import _replace_charref
+except ImportError:
+    _replace_charref = None
 
 
 PREVENT_BACKSLASH = r'(?<!\\)(?:\\\\)*'
@@ -21,10 +24,12 @@
 HTML_TAGNAME = r'[A-Za-z][A-Za-z0-9-]*'
 HTML_ATTRIBUTES = (
     r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*'
-    r'(?:\s*=\s*(?:[^ "\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
+    r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
 )
 
 
+
+
 def escape(s, quote=True):
     s = s.replace("&", "&amp;")
     s = s.replace("<", "&lt;")
@@ -40,18 +45,33 @@ def escape_url(link):
         '!$&()*+,;='      # sub-delims - "'" (rfc3986)
         '%'               # leave already-encoded octets alone
     )
-
-    if html is None:
-        return quote(link.encode('utf-8'), safe=safe)
-    return html.escape(quote(html.unescape(link), safe=safe))
+    return escape(quote(unescape(link), safe=safe))
 
 
-def escape_html(s):
-    if html is not None:
-        return html.escape(html.unescape(s)).replace('&#x27;', "'")
-    return escape(s)
+def safe_entity(s):
+    return escape(unescape(s))
 
 
 def unikey(s):
     key = ' '.join(s.split()).strip()
     return key.lower().upper()
+
+
+_charref = re.compile(
+    r'&(#[0-9]{1,7};'
+    r'|#[xX][0-9a-fA-F]+;'
+    r'|[^\t\n\f <&#;]{1,32};)'
+)
+
+
+def unescape(s):
+    """
+    Copied from `html.unescape`, but with a different `_charref` pattern:
+    CommonMark does not accept entity references without a trailing semicolon.
+    """
+    if not _replace_charref:
+        return s
+
+    if '&' not in s:
+        return s
+    return _charref.sub(_replace_charref, s)