From 7c4f4c8c521e5706e5a72c774f8c6a1c3027f70c Mon Sep 17 00:00:00 2001
From: Charmander <~@charmander.me>
Date: Sat, 2 Nov 2024 03:17:14 -0700
Subject: [PATCH] Fix `<a href=" javascript:">` with leading space bypassing
 HTML filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Speaking of CVE-2023-24329…

Yes! Weasyl has one of the most obvious XSSes, in 2024.

I’m looking forward to getting to the point of implementing a real Content-Security-Policy, a real safe document model that isn’t created from arbitrary HTML, ….
---
 libweasyl/defang.py         | 13 +++++++++----
 libweasyl/test/test_text.py | 23 ++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/libweasyl/defang.py b/libweasyl/defang.py
index 1c7e6b0f4..eabe462d6 100644
--- a/libweasyl/defang.py
+++ b/libweasyl/defang.py
@@ -77,6 +77,9 @@
 """, re.X | re.I)
 
 
+_C0_OR_SPACE = "".join(map(chr, range(0x21)))
+
+
 def get_scheme(url):
     """
     Get the scheme from a URL, if the URL is valid.
@@ -128,13 +131,15 @@ def defang(fragment):
         extend_attributes = []
 
         for key, value in child.items():
-            if key == "href" and child.tag == "a" and get_scheme(value) in allowed_schemes:
-                url = urlparse(value)
+            # `value_stripped` is a correct thing to do according to the WHATWG URL spec (but not the only possible validation error, and not all are handled here yet). It also works around CVE-2023-24329 while on Python <3.10.12.
+            if key == "href" and child.tag == "a" and get_scheme(value_stripped := value.strip(_C0_OR_SPACE)) in allowed_schemes:
+                url = urlparse(value_stripped)
+                extend_attributes.append((key, value_stripped))
 
                 if url.hostname not in (None, "www.weasyl.com", "weasyl.com"):
                     extend_attributes.append(("rel", "nofollow ugc"))
-            elif key == "src" and child.tag == "img" and get_scheme(value) in allowed_schemes:
-                pass
+            elif key == "src" and child.tag == "img" and get_scheme(value_stripped := value.strip(_C0_OR_SPACE)) in allowed_schemes:
+                extend_attributes.append((key, value_stripped))
             elif key == "style" and ALLOWED_STYLE.match(value):
                 pass
             elif key == "class":
diff --git a/libweasyl/test/test_text.py b/libweasyl/test/test_text.py
index 17e87473e..b4c419a70 100644
--- a/libweasyl/test/test_text.py
+++ b/libweasyl/test/test_text.py
@@ -1,8 +1,10 @@
 # encoding: utf-8
+from lxml import html
 from lxml.etree import LIBXML_VERSION
 import pytest
 
-from libweasyl.text import markdown, markdown_excerpt, markdown_link
+from libweasyl.defang import defang
+from libweasyl.text import markdown, markdown_excerpt, markdown_link, strip_outer_tag
 
 
 libxml_xfail = pytest.mark.xfail(LIBXML_VERSION < (2, 9), reason='libxml2 too old to preserve whitespace')
@@ -133,6 +135,7 @@ def test_markdown_strikethrough():
     ('<a href="http://example.com/">external</a>', '<a href="http://example.com/" rel="nofollow ugc">external</a>'),
     ('<a href="http://example.com/" rel="noreferrer">external</a>', '<a href="http://example.com/" rel="nofollow ugc">external</a>'),
     ("[external](//example.com/)", '<a href="//example.com/" rel="nofollow ugc">external</a>'),
+    ('<a href=" //example.com/">external</a>', '<a href="//example.com/" rel="nofollow ugc">external</a>'),
 ])
 def test_markdown_external_link_noreferrer(target, expected):
     assert markdown(target) == "<p>%s</p>\n" % (expected,)
@@ -156,6 +159,24 @@ def test_tag_stripping():
     assert markdown("<!--[if IE]><script>alert(1)</script><![endif]-->") == "\n"
 
 
+@pytest.mark.parametrize(('target', 'expected'), [
+    ('<a href=" javascript:alert(1)">no</a>', "<a>no</a>"),
+    ('<a href="java&#x09;script:alert(1)">no</a>', "<a>no</a>"),
+])
+def test_unsafe_link(target, expected):
+    assert markdown(target) == "<p>%s</p>\n" % (expected,)
+
+
+@pytest.mark.parametrize(('target', 'expected'), [
+    ('<a href="java\nscript:alert(1)">no</a>', "<a>no</a>"),
+])
+def test_unsafe_link_direct(target, expected):
+    fragment = html.fragment_fromstring(target, create_parent=True)
+    defang(fragment)
+    start, stripped, end = strip_outer_tag(html.tostring(fragment, encoding="unicode"))
+    assert stripped == expected
+
+
 markdown_excerpt_tests = [
     ('', ''),
     ('short', 'short'),