Skip to content

Commit

Permalink
Remove only the CSS comment if suspicious content is detected
Browse files Browse the repository at this point in the history
  • Loading branch information
frenzymadness committed Nov 14, 2024
1 parent 90bcfa8 commit a074425
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 26 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Bugs fixed
within CSS comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
``<style>`` tags may lose their intended function, allowing comments
like ``/* foo */`` to potentially be executed by the browser.
If suspicious content is detected, only the comment is removed.

0.3.1 (2024-10-09)
==================
Expand Down
62 changes: 40 additions & 22 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,11 @@ def __call__(self, doc):
new = _replace_css_import('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
el.text = '/* deleted */'
elif new != old:
new = '/* deleted */'
else:
new = self._remove_sneaky_css_comments(new)

if new != old:
el.text = new
if self.comments:
kill_tags.add(etree.Comment)
Expand Down Expand Up @@ -568,7 +571,9 @@ def _remove_javascript_link(self, link):
return ''
return link

_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
_comments_re = re.compile(r'/\*.*?\*/', re.S)
_find_comments = _comments_re.finditer
_substitute_comments = _comments_re.sub

def _has_sneaky_javascript(self, style):
"""
Expand All @@ -581,29 +586,42 @@ def _has_sneaky_javascript(self, style):
that and remove only the Javascript from the style; this catches
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()

for with_comments in True, False:
if not with_comments:
style = self._substitute_comments('', style)

style = style.replace('\\', '')

if _has_javascript_scheme(style):
return True
if 'expression(' in style:
return True
if '@import' in style:
return True
if '</noscript' in style:
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
return True
if _looks_like_tag_content(style):
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
return True
if _has_javascript_scheme(style):
return True
if 'expression(' in style:
return True
if '@import' in style:
return True
if '</noscript' in style:
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
return True
if _looks_like_tag_content(style):
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
return True
return False

def _remove_sneaky_css_comments(self, style):
    """
    Replace every suspicious CSS comment in ``style`` with ``/* deleted */``.

    Browsers might parse <style> as an ordinary HTML tag in some specific
    contexts, and that might cause code hidden in CSS comments to run.
    Each ``/* ... */`` comment is therefore inspected on its own: if it
    contains a javascript scheme or looks like tag content, that comment
    (and only that comment) is replaced. The rest of the stylesheet,
    including harmless comments, is returned untouched.

    :param style: text content of a ``<style>`` element.
    :return: the style text with suspicious comments neutralised.
    """
    def _neutralize(match):
        comment = match.group(0)
        if _has_javascript_scheme(comment) or _looks_like_tag_content(comment):
            return "/* deleted */"
        # Harmless comment: hand it back verbatim so the CSS is unchanged.
        return comment

    # One pass over the stylesheet; each matched comment is judged and
    # replaced in place, so no text outside a matched comment is touched
    # and nothing is written to stdout.
    return self._substitute_comments(_neutralize, style)

def clean_html(self, html):
result_type = type(html)
if isinstance(html, (str, bytes)):
Expand Down
10 changes: 6 additions & 4 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,19 +129,21 @@ def test_sneaky_js_in_math_style(self):

def test_sneaky_js_in_style_comment_math_svg(self):
for tag in "svg", "math":
html = f'<{tag}><style>/*<img src onerror=alert(origin)>*/'
html = f'<{tag}><style>p {{color: red;}}/*<img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></{tag}>'
s = lxml.html.fragment_fromstring(html)

expected = f'<{tag}><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></{tag}>'.encode()

self.assertEqual(
f'<{tag}><style>/* deleted */</style></{tag}>'.encode(),
expected,
lxml.html.tostring(clean_html(s)))

def test_sneaky_js_in_style_comment_noscript(self):
html = '<noscript><style>/*</noscript><img src onerror=alert(origin)>*/'
html = '<noscript><style>p {{color: red;}}/*</noscript><img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></noscript>'
s = lxml.html.fragment_fromstring(html)

self.assertEqual(
b'<noscript><style>/* deleted */</style></noscript>',
b'<noscript><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></noscript>',
lxml.html.tostring(clean_html(s)))

def test_sneaky_import_in_style(self):
Expand Down

0 comments on commit a074425

Please sign in to comment.