Remove only the CSS comment if suspicious content is detected

fedora-python · Nov 14, 2024 · a074425 · a074425
1 parent 90bcfa8
commit a074425
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 26 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -16,6 +16,7 @@ Bugs fixed
   within CSS comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
   ``<style>`` tags may lose their intended function, allowing comments
   like ``/* foo */`` to potentially be executed by the browser.
+  If suspicious content is detected, only the comment is removed.
 
 0.3.1 (2024-10-09)
 ==================

diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
@@ -366,8 +366,11 @@ def __call__(self, doc):
                     new = _replace_css_import('', new)
                     if self._has_sneaky_javascript(new):
                         # Something tricky is going on...
-                        el.text = '/* deleted */'
-                    elif new != old:
+                        new = '/* deleted */'
+                    else:
+                        new = self._remove_sneaky_css_comments(new)
+
+                    if new != old:
                         el.text = new
         if self.comments:
             kill_tags.add(etree.Comment)
@@ -568,7 +571,9 @@ def _remove_javascript_link(self, link):
             return ''
         return link
 
-    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
+    _comments_re = re.compile(r'/\*.*?\*/', re.S)
+    _find_comments = _comments_re.finditer
+    _substitute_comments = _comments_re.sub
 
     def _has_sneaky_javascript(self, style):
         """
@@ -581,29 +586,42 @@ def _has_sneaky_javascript(self, style):
         that and remove only the Javascript from the style; this catches
         more sneaky attempts.
         """
+        style = self._substitute_comments('', style)
+        style = style.replace('\\', '')
         style = _substitute_whitespace('', style)
         style = style.lower()
-
-        for with_comments in True, False:
-            if not with_comments:
-                style = self._substitute_comments('', style)
-
-            style = style.replace('\\', '')
-
-            if _has_javascript_scheme(style):
-                return True
-            if 'expression(' in style:
-                return True
-            if '@import' in style:
-                return True
-            if '</noscript' in style:
-                # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
-                return True
-            if _looks_like_tag_content(style):
-                # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
-                return True
+        if _has_javascript_scheme(style):
+            return True
+        if 'expression(' in style:
+            return True
+        if '@import' in style:
+            return True
+        if '</noscript' in style:
+            # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+            return True
+        if _looks_like_tag_content(style):
+            # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
+            return True
         return False
 
+    def _remove_sneaky_css_comments(self, style):
+        """
+        Replace every CSS comment that contains suspicious code with
+        ``/* deleted */`` and return the resulting style text.
+
+        Browsers might parse <style> as an ordinary HTML tag
+        in some specific context and that might cause code in CSS
+        comments to run.
+        """
+        for match in self._find_comments(style):
+            comment = match.group(0)
+            # A comment is suspicious when either helper check below flags it.
+            if _has_javascript_scheme(comment) or _looks_like_tag_content(comment):
+                # str.replace also rewrites any identical copies of this comment.
+                style = style.replace(comment, "/* deleted */")
+
+        return style
+
     def clean_html(self, html):
         result_type = type(html)
         if isinstance(html, (str, bytes)):

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -129,19 +129,21 @@ def test_sneaky_js_in_math_style(self):
 
     def test_sneaky_js_in_style_comment_math_svg(self):
         for tag in "svg", "math":
-            html = f'<{tag}><style>/*<img src onerror=alert(origin)>*/'
+            html = f'<{tag}><style>p {{color: red;}}/*<img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></{tag}>'
             s = lxml.html.fragment_fromstring(html)
 
+            expected = f'<{tag}><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></{tag}>'.encode()
+
             self.assertEqual(
-                f'<{tag}><style>/* deleted */</style></{tag}>'.encode(),
+                expected,  # only the comment is replaced; the surrounding CSS rules are kept
                 lxml.html.tostring(clean_html(s)))
 
     def test_sneaky_js_in_style_comment_noscript(self):
-        html = '<noscript><style>/*</noscript><img src onerror=alert(origin)>*/'
+        html = '<noscript><style>p {color: red;}/*</noscript><img src onerror=alert(origin)>*/h2 {color: blue;}</style></noscript>'  # plain string, so braces are written once
         s = lxml.html.fragment_fromstring(html)
 
         self.assertEqual(
-            b'<noscript><style>/* deleted */</style></noscript>',
+            b'<noscript><style>p {color: red;}/* deleted */h2 {color: blue;}</style></noscript>',
             lxml.html.tostring(clean_html(s)))
 
     def test_sneaky_import_in_style(self):