Skip to content

Commit

Permalink
Correct escape text
Browse files: browse the repository at this point in the history
  • Loading branch information
lepture committed May 7, 2022
1 parent 8e3fe09 commit 8452faf
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 36 deletions.
4 changes: 2 additions & 2 deletions mistune/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .inline_parser import InlineParser, InlineState
from .renderers import HTMLRenderer
# from .plugins import PLUGINS
from .util import escape, escape_url, escape_html, unikey
from .util import escape, escape_url, safe_entity, unikey


def create_markdown(escape=True, hard_wrap=False, renderer=None, plugins=None):
Expand Down Expand Up @@ -69,7 +69,7 @@ def markdown(text, escape=True, renderer=None, plugins=None):
'Markdown', 'HTMLRenderer',
'BlockParser', 'BlockState',
'InlineParser', 'InlineState',
'escape', 'escape_url', 'escape_html', 'unikey',
'escape', 'escape_url', 'safe_entity', 'unikey',
'html', 'create_markdown', 'markdown',
]

Expand Down
10 changes: 6 additions & 4 deletions mistune/block_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
from .util import (
unikey,
escape,
escape_url,
safe_entity,
ESCAPE_CHAR_RE,
LINK_LABEL,
LINK_BRACKET_RE,
Expand Down Expand Up @@ -179,7 +181,7 @@ def parse_indent_code(self, line, cursor, state):

code = expand_leading_tab(code)
code = _INDENT_CODE_TRIM.sub('', code)
code = code.strip('\n')
code = escape(code.strip('\n'))
state.add_token({'type': 'block_code', 'raw': code}, start_line, cursor)
return cursor + 1

Expand Down Expand Up @@ -215,9 +217,9 @@ def parse_fenced_code(self, line, cursor, state):
_trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
code = _trim_pattern.sub('', code)

token = {'type': 'block_code', 'raw': code}
token = {'type': 'block_code', 'raw': escape(code)}
if info:
token['attrs'] = {'info': info.strip()}
token['attrs'] = {'info': safe_entity(info.strip())}

state.add_token(token, start_line, cursor)
return cursor + 1
Expand Down Expand Up @@ -288,7 +290,7 @@ def parse_def_link(self, line, cursor, state):
attrs = {'url': escape_url(url)}
if title:
title = ESCAPE_CHAR_RE.sub(r'\1', title)
attrs['title'] = title
attrs['title'] = safe_entity(title)
state.def_links[key] = attrs
return cursor + 1

Expand Down
21 changes: 11 additions & 10 deletions mistune/inline_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

escape,
escape_url,
safe_entity,
unikey,
)

Expand Down Expand Up @@ -118,7 +119,7 @@ def parse_escape(self, m, state):
text = ESCAPE_CHAR_RE.sub(r'\1', text)
state.tokens.append({
'type': 'text',
'raw': text,
'raw': safe_entity(text),
})
return m.end()

Expand Down Expand Up @@ -245,7 +246,7 @@ def _parse_std_link(self, m, token_type, text, state):
'children': self.render_text(text, new_state),
'attrs': {
'url': escape_url(url),
'title': title,
'title': safe_entity(title),
},
})
return m3.end()
Expand Down Expand Up @@ -302,7 +303,7 @@ def parse_auto_email(self, m, state):
return pos

def _parse_auto_link(self, url, text, state):
children = self.render_tokens([{'type': 'text', 'raw': escape(text)}])
children = self.render_tokens([{'type': 'text', 'raw': safe_entity(text)}])
state.tokens.append({
'type': 'link',
'children': children,
Expand Down Expand Up @@ -336,7 +337,7 @@ def parse_emphasis(self, m, state):
return self.record_text(pos, marker, state)

if hole:
state.tokens.append({'type': 'text', 'raw': hole})
state.tokens.append({'type': 'text', 'raw': safe_entity(hole)})

new_state = state.copy()
text = m1.group(1)
Expand Down Expand Up @@ -378,7 +379,7 @@ def parse_codespan(self, m, state):
if len(code.strip()):
if code.startswith(' ') and code.endswith(' '):
code = code[1:-1]
state.tokens.append({'type': 'codespan', 'raw': code})
state.tokens.append({'type': 'codespan', 'raw': escape(code)})
return m.end()
return self.record_text(pos, marker, state)

Expand Down Expand Up @@ -406,7 +407,7 @@ def parse(self, s, pos, state):

end_pos = m.start()
if end_pos > pos:
hole = s[pos:end_pos]
hole = safe_entity(s[pos:end_pos])
state.tokens.append({'type': 'text', 'raw': hole})

token_type = m.lastgroup
Expand All @@ -415,20 +416,20 @@ def parse(self, s, pos, state):
if not new_pos:
# move cursor 1 character forward
pos = end_pos + 1
hole = s[end_pos:pos]
hole = safe_entity(s[end_pos:pos])
state.tokens.append({'type': 'text', 'raw': hole})
else:
pos = new_pos

if pos == 0:
# special case, just pure text
state.tokens.append({'type': 'text', 'raw': s})
state.tokens.append({'type': 'text', 'raw': safe_entity(s)})
elif pos < len(s):
state.tokens.append({'type': 'text', 'raw': s[pos:]})
state.tokens.append({'type': 'text', 'raw': safe_entity(s[pos:])})
return state.tokens

def record_text(self, pos, text, state):
state.tokens.append({'type': 'text', 'raw': text})
state.tokens.append({'type': 'text', 'raw': safe_entity(text)})
return pos

def render_text(self, s: str, state: InlineState):
Expand Down
16 changes: 7 additions & 9 deletions mistune/renderers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .util import escape, escape_html
from .util import escape


class BaseRenderer(object):
Expand Down Expand Up @@ -78,20 +78,19 @@ def _safe_url(self, url):
def text(self, text):
if self._escape:
return escape(text)
return escape_html(text)
return text

def link(self, text, url, title=None):
s = '<a href="' + self._safe_url(url) + '"'
if title:
s += ' title="' + escape_html(title) + '"'
s += ' title="' + title + '"'
return s + '>' + text + '</a>'

def image(self, text, url, title=None):
src = self._safe_url(url)
alt = escape_html(text)
s = '<img src="' + src + '" alt="' + alt + '"'
s = '<img src="' + src + '" alt="' + escape(text) + '"'
if title:
s += ' title="' + escape_html(title) + '"'
s += ' title="' + title + '"'
return s + ' />'

def emphasis(self, text):
Expand All @@ -101,7 +100,7 @@ def strong(self, text):
return '<strong>' + text + '</strong>'

def codespan(self, text):
return '<code>' + escape(text) + '</code>'
return '<code>' + text + '</code>'

def linebreak(self):
return '<br />\n'
Expand Down Expand Up @@ -136,9 +135,8 @@ def block_code(self, code, info=None):
info = info.strip()
if info:
lang = info.split(None, 1)[0]
lang = escape_html(lang)
html += ' class="language-' + lang + '"'
return html + '>' + escape(code) + '</code></pre>\n'
return html + '>' + code + '</code></pre>\n'

def block_quote(self, text):
return '<blockquote>' + text + '</blockquote>\n'
Expand Down
42 changes: 31 additions & 11 deletions mistune/util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import re
try:
from urllib.parse import quote
import html
except ImportError:
from urllib import quote
html = None

try:
from html import _replace_charref
except ImportError:
_replace_charref = None


PREVENT_BACKSLASH = r'(?<!\\)(?:\\\\)*'
Expand All @@ -21,10 +24,12 @@
HTML_TAGNAME = r'[A-Za-z][A-Za-z0-9-]*'
HTML_ATTRIBUTES = (
r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*'
r'(?:\s*=\s*(?:[^ "\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
)




def escape(s, quote=True):
s = s.replace("&", "&amp;")
s = s.replace("<", "&lt;")
Expand All @@ -40,18 +45,33 @@ def escape_url(link):
'!$&()*+,;=' # sub-delims - "'" (rfc3986)
'%' # leave already-encoded octets alone
)

if html is None:
return quote(link.encode('utf-8'), safe=safe)
return html.escape(quote(html.unescape(link), safe=safe))
return escape(quote(unescape(link), safe=safe))


def escape_html(s):
    """Return ``s`` with character references normalised and HTML-escaped.

    When the standard ``html`` module is available, existing references in
    ``s`` are first decoded and the result is escaped again, so text that is
    already escaped is not escaped twice. Apostrophes are kept literal
    instead of being emitted as ``&#x27;``.

    When ``html`` is ``None`` (the import fallback), the module-level
    ``escape`` helper is used instead.
    """
    if html is None:
        return escape(s)

    decoded = html.unescape(s)
    escaped = html.escape(decoded)
    # html.escape turns "'" into "&#x27;"; restore the literal apostrophe.
    return escaped.replace('&#x27;', "'")
def safe_entity(s):
    """Decode character references in ``s`` and escape the result.

    The text is passed through ``unescape`` first and then through
    ``escape``, so a reference that is already present in the input is
    not escaped a second time.
    """
    decoded = unescape(s)
    return escape(decoded)


def unikey(s):
    """Return a whitespace- and case-normalised key for ``s``.

    Runs of whitespace collapse to a single space, leading and trailing
    whitespace is dropped, and the text is lower-cased and then
    upper-cased (so, for example, ``'ß'`` and ``'SS'`` produce the same key).
    """
    # str.split() with no arguments already discards leading/trailing
    # whitespace, so the joined text needs no further trimming.
    words = s.split()
    collapsed = ' '.join(words)
    return collapsed.lower().upper()


# Character references recognised by ``unescape``: decimal (1-7 digits),
# hexadecimal, or a named reference of 1-32 characters. Every alternative
# must end with a semicolon.
_charref = re.compile(
    r'&(#[0-9]{1,7};'
    r'|#[xX][0-9a-fA-F]+;'
    r'|[^\t\n\f <&#;]{1,32};)'
)


def unescape(s):
    """Replace character references in ``s`` with the characters they name.

    Modelled on ``html.unescape``, but matching with this module's own
    ``_charref`` pattern: CommonMark does not accept entity references
    without a trailing semicolon, so only references ending in ``;`` are
    replaced.

    ``s`` is returned unchanged when ``_replace_charref`` is unavailable
    or when ``s`` contains no ``&`` at all.
    """
    nothing_to_do = not _replace_charref or '&' not in s
    if nothing_to_do:
        return s
    return _charref.sub(_replace_charref, s)

0 comments on commit 8452faf

Please sign in to comment.