diff --git a/mistune.py b/mistune.py index 50bc85f..c25596c 100644 --- a/mistune.py +++ b/mistune.py @@ -22,6 +22,7 @@ _key_pattern = re.compile(r'\s+') +_nonalpha_pattern = re.compile(r'\W') _escape_pattern = re.compile(r'&(?!#?\w+;)') _newline_pattern = re.compile(r'\r\n|\r') _block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M) @@ -36,6 +37,7 @@ _valid_end = r'(?!:/|[^\w\s@]*@)\b' _valid_attr = r'''"[^"]*"|'[^']*'|[^'">]''' _block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end) +_scheme_blacklist = ('javascript', 'data', 'vbscript') def _pure_pattern(regex): @@ -70,6 +72,19 @@ def escape(text, quote=False, smart_amp=True): return text +def escape_link(url, **kwargs): + """Remove dangerous URL schemes like javascript: (matched case-insensitively) and escape afterwards; extra kwargs are forwarded to escape().""" + if ':' in url: + scheme, _ = url.split(':', 1) + scheme = _nonalpha_pattern.sub('', scheme).lower()  # schemes are case-insensitive: 'JavaScript:' must be caught too + # whitelist would be better but mistune's use case is too general + if scheme in _scheme_blacklist: + return '' + # smart_amp is forced off so that &entities; is escaped to &amp;entities; + kwargs['smart_amp'] = False + return escape(url, **kwargs) + + def preprocessing(text, tab=4): text = _newline_pattern.sub('\n', text) text = text.replace('\t', ' ' * tab) @@ -838,8 +853,7 @@ def link(self, link, title, text): :param title: title content for `title` attribute. :param text: text content for description. """ - if link.startswith('javascript:'): - link = '' + link = escape_link(link, quote=True) if not title: return '%s' % (link, text) title = escape(title, quote=True) @@ -852,8 +866,7 @@ def image(self, src, title, text): :param title: title text of the image. :param text: alt text of the image. 
""" - if src.startswith('javascript:'): - src = '' + src = escape_link(src, quote=True) text = escape(text, quote=True) if title: title = escape(title, quote=True) diff --git a/tests/fixtures/normal/amps_and_angles_encoding.html b/tests/fixtures/normal/amps_and_angles_encoding.html index 138f4d5..483f8ff 100644 --- a/tests/fixtures/normal/amps_and_angles_encoding.html +++ b/tests/fixtures/normal/amps_and_angles_encoding.html @@ -8,10 +8,10 @@
6 > 5.
-Here's a link with an ampersand in the URL.
+Here's a link with an ampersand in the URL.
Here's a link with an amersand in the link text: AT&T.
-Here's an inline link.
+Here's an inline link.
-Here's an inline link.
+Here's an inline link.
diff --git a/tests/test_extra.py b/tests/test_extra.py index 0198329..1685c24 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -18,10 +18,25 @@ def test_linebreak(): def test_safe_links(): - ret = mistune.markdown('javascript ![foo](