diff --git a/mistune.py b/mistune.py index 50bc85f..c25596c 100644 --- a/mistune.py +++ b/mistune.py @@ -22,6 +22,7 @@ _key_pattern = re.compile(r'\s+') +_nonalpha_pattern = re.compile(r'\W') _escape_pattern = re.compile(r'&(?!#?\w+;)') _newline_pattern = re.compile(r'\r\n|\r') _block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M) @@ -36,6 +37,7 @@ _valid_end = r'(?!:/|[^\w\s@]*@)\b' _valid_attr = r'''"[^"]*"|'[^']*'|[^'">]''' _block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end) +_scheme_blacklist = ('javascript', 'data', 'vbscript') def _pure_pattern(regex): @@ -70,6 +72,19 @@ def escape(text, quote=False, smart_amp=True): return text +def escape_link(url, **kwargs): + """Remove dangerous URL schemes like javascript: and escape afterwards.""" + if ':' in url: + scheme, _ = url.split(':', 1) + scheme = _nonalpha_pattern.sub('', scheme) + # whitelist would be better but mistune's use case is too general + if scheme in _scheme_blacklist: + return '' + # escape &entities; to &entities; + kwargs['smart_amp'] = False + return escape(url, **kwargs) + + def preprocessing(text, tab=4): text = _newline_pattern.sub('\n', text) text = text.replace('\t', ' ' * tab) @@ -838,8 +853,7 @@ def link(self, link, title, text): :param title: title content for `title` attribute. :param text: text content for description. """ - if link.startswith('javascript:'): - link = '' + link = escape_link(link, quote=True) if not title: return '%s' % (link, text) title = escape(title, quote=True) @@ -852,8 +866,7 @@ def image(self, src, title, text): :param title: title text of the image. :param text: alt text of the image. """ - if src.startswith('javascript:'): - src = '' + src = escape_link(src, quote=True) text = escape(text, quote=True) if title: title = escape(title, quote=True) diff --git a/tests/fixtures/normal/amps_and_angles_encoding.html b/tests/fixtures/normal/amps_and_angles_encoding.html index 138f4d5..483f8ff 100644 --- a/tests/fixtures/normal/amps_and_angles_encoding.html +++ b/tests/fixtures/normal/amps_and_angles_encoding.html @@ -8,10 +8,10 @@

6 > 5.

-

Here's a link with an ampersand in the URL.

+

Here's a link with an ampersand in the URL.

Here's a link with an amersand in the link text: AT&T.

-

Here's an inline link.

+

Here's an inline link.

-

Here's an inline link.

+

Here's an inline link.

diff --git a/tests/test_extra.py b/tests/test_extra.py index 0198329..1685c24 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -18,10 +18,25 @@ def test_linebreak(): def test_safe_links(): - ret = mistune.markdown('javascript ![foo]() alert') - assert 'src=""' in ret - ret = mistune.markdown('javascript [foo]() alert') - assert 'href=""' in ret + attack_vectors = ( + # "standard" javascript pseudo protocol + ('javascript:alert`1`', ''), + # javascript pseudo protocol with entities + ('javascript:alert`1`', 'javascript&colon;alert`1`'), + # javascript pseudo protocol with prefix (dangerous in Chrome) + ('\x1Ajavascript:alert`1`', ''), + # data-URI (dangerous in Firefox) + ('data:text/html,', ''), + # vbscript-URI (dangerous in Internet Explorer) + ('vbscript:msgbox', ''), + # breaking out of the attribute + ('"<>', '"<>'), + ) + for vector, expected in attack_vectors: + # image + assert 'src="%s"' % expected in mistune.markdown('![atk](%s)' % vector) + # link + assert 'href="%s"' % expected in mistune.markdown('[atk](%s)' % vector) def test_skip_style():