From 7c4f4c8c521e5706e5a72c774f8c6a1c3027f70c Mon Sep 17 00:00:00 2001 From: Charmander <~@charmander.me> Date: Sat, 2 Nov 2024 03:17:14 -0700 Subject: [PATCH] Fix `` with leading space bypassing HTML filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Speaking of CVE-2023-24329… Yes! Weasyl has one of the most obvious XSSes, in 2024. I’m looking forward to getting to the point of implementing a real Content-Security-Policy, a real safe document model that isn’t created from arbitrary HTML, …. --- libweasyl/defang.py | 13 +++++++++---- libweasyl/test/test_text.py | 23 ++++++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/libweasyl/defang.py b/libweasyl/defang.py index 1c7e6b0f4..eabe462d6 100644 --- a/libweasyl/defang.py +++ b/libweasyl/defang.py @@ -77,6 +77,9 @@ """, re.X | re.I) +_C0_OR_SPACE = "".join(map(chr, range(0x21))) + + def get_scheme(url): """ Get the scheme from a URL, if the URL is valid. @@ -128,13 +131,15 @@ def defang(fragment): extend_attributes = [] for key, value in child.items(): - if key == "href" and child.tag == "a" and get_scheme(value) in allowed_schemes: - url = urlparse(value) + # `value_stripped` is a correct thing to do according to the WHATWG URL spec (but not the only possible validation error, and not all are handled here yet). It also works around CVE-2023-24329 while on Python <3.10.12. + if key == "href" and child.tag == "a" and get_scheme(value_stripped := value.strip(_C0_OR_SPACE)) in allowed_schemes: + url = urlparse(value_stripped) + extend_attributes.append((key, value_stripped)) if url.hostname not in (None, "www.weasyl.com", "weasyl.com"): extend_attributes.append(("rel", "nofollow ugc")) - elif key == "src" and child.tag == "img" and get_scheme(value) in allowed_schemes: - pass + elif key == "src" and child.tag == "img" and get_scheme(value_stripped := value.strip(_C0_OR_SPACE)) in allowed_schemes: + extend_attributes.append((key, value_stripped)) elif key == "style" and ALLOWED_STYLE.match(value): pass elif key == "class": diff --git a/libweasyl/test/test_text.py b/libweasyl/test/test_text.py index 17e87473e..b4c419a70 100644 --- a/libweasyl/test/test_text.py +++ b/libweasyl/test/test_text.py @@ -1,8 +1,10 @@ # encoding: utf-8 +from lxml import html from lxml.etree import LIBXML_VERSION import pytest -from libweasyl.text import markdown, markdown_excerpt, markdown_link +from libweasyl.defang import defang +from libweasyl.text import markdown, markdown_excerpt, markdown_link, strip_outer_tag libxml_xfail = pytest.mark.xfail(LIBXML_VERSION < (2, 9), reason='libxml2 too old to preserve whitespace') @@ -133,6 +135,7 @@ def test_markdown_strikethrough(): ('external', 'external'), ('external', 'external'), ("[external](//example.com/)", 'external'), + ('external', 'external'), ]) def test_markdown_external_link_noreferrer(target, expected): assert markdown(target) == "

%s

\n" % (expected,) @@ -156,6 +159,24 @@ def test_tag_stripping(): assert markdown("") == "\n" +@pytest.mark.parametrize(('target', 'expected'), [ + ('no', "no"), + ('no', "no"), +]) +def test_unsafe_link(target, expected): + assert markdown(target) == "

%s

\n" % (expected,) + + +@pytest.mark.parametrize(('target', 'expected'), [ + ('no', "no"), +]) +def test_unsafe_link_direct(target, expected): + fragment = html.fragment_fromstring(target, create_parent=True) + defang(fragment) + start, stripped, end = strip_outer_tag(html.tostring(fragment, encoding="unicode")) + assert stripped == expected + + markdown_excerpt_tests = [ ('', ''), ('short', 'short'),