Skip to content

Commit 582b28b

Browse files
frenzymadnessillia-vgpshead
authored andcommitted
00399-cve-2023-24329.patch
00399 # * pythongh-102153: Start stripping C0 control and space chars in `urlsplit` (pythonGH-102508) `urllib.parse.urlsplit` has already been respecting the WHATWG spec a bit pythonGH-25595. This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/GH-url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329). Backported to Python 2 from Python 3.12. Co-authored-by: Illia Volochii <illia.volochii@gmail.com> Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org> Co-authored-by: Lumir Balhar <lbalhar@redhat.com>
1 parent 80ca2b4 commit 582b28b

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed

Lib/test/test_urlparse.py

+57
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,64 @@ def test_urlsplit_remove_unsafe_bytes(self):
666666
self.assertEqual(p.scheme, "https")
667667
self.assertEqual(p.geturl(), "https://www.python.org/javascript:alert('msg')/?query=something#fragment")
668668

669+
def test_urlsplit_strip_url(self):
670+
noise = "".join([chr(i) for i in range(0, 0x20 + 1)])
671+
base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
669672

673+
url = noise.decode("utf-8") + base_url
674+
p = urlparse.urlsplit(url)
675+
self.assertEqual(p.scheme, "http")
676+
self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
677+
self.assertEqual(p.path, "/doc/")
678+
self.assertEqual(p.query, "query=yes")
679+
self.assertEqual(p.fragment, "frag")
680+
self.assertEqual(p.username, "User")
681+
self.assertEqual(p.password, "Pass")
682+
self.assertEqual(p.hostname, "www.python.org")
683+
self.assertEqual(p.port, 80)
684+
self.assertEqual(p.geturl(), base_url)
685+
686+
url = noise + base_url.encode("utf-8")
687+
p = urlparse.urlsplit(url)
688+
self.assertEqual(p.scheme, b"http")
689+
self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
690+
self.assertEqual(p.path, b"/doc/")
691+
self.assertEqual(p.query, b"query=yes")
692+
self.assertEqual(p.fragment, b"frag")
693+
self.assertEqual(p.username, b"User")
694+
self.assertEqual(p.password, b"Pass")
695+
self.assertEqual(p.hostname, b"www.python.org")
696+
self.assertEqual(p.port, 80)
697+
self.assertEqual(p.geturl(), base_url.encode("utf-8"))
698+
699+
# Test that trailing space is preserved as some applications rely on
700+
# this within query strings.
701+
query_spaces_url = "https://www.python.org:88/doc/?query= "
702+
p = urlparse.urlsplit(noise.decode("utf-8") + query_spaces_url)
703+
self.assertEqual(p.scheme, "https")
704+
self.assertEqual(p.netloc, "www.python.org:88")
705+
self.assertEqual(p.path, "/doc/")
706+
self.assertEqual(p.query, "query= ")
707+
self.assertEqual(p.port, 88)
708+
self.assertEqual(p.geturl(), query_spaces_url)
709+
710+
p = urlparse.urlsplit("www.pypi.org ")
711+
# That "hostname" gets considered a "path" due to the
712+
# trailing space and our existing logic... YUCK...
713+
# and re-assembles via geturl aka unurlsplit into the original.
714+
# django.core.validators.URLValidator (at least through v3.2) relies on
715+
# this, for better or worse, to catch it in a ValidationError via its
716+
# regular expressions.
717+
# Here we test the basic round trip concept of such a trailing space.
718+
self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ")
719+
720+
# with scheme as cache-key
721+
url = "//www.python.org/"
722+
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
723+
for _ in range(2):
724+
p = urlparse.urlsplit(url, scheme=scheme)
725+
self.assertEqual(p.scheme, "https")
726+
self.assertEqual(p.geturl(), "https://www.python.org/")
670727

671728
def test_attributes_bad_port(self):
672729
"""Check handling of non-integer ports."""

Lib/urlparse.py

+10
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
parsing quirks from older RFCs are retained. The testcases in
2727
test_urlparse.py provides a good indicator of parsing behavior.
2828
29+
The WHATWG URL Parser spec should also be considered. We are not compliant with
30+
it either due to existing user code API behavior expectations (Hyrum's Law).
31+
It serves as a useful guide when making changes.
32+
2933
"""
3034

3135
import re
@@ -63,6 +67,10 @@
6367
'0123456789'
6468
'+-.')
6569

70+
# Leading and trailing C0 control and space to be stripped per WHATWG spec.
71+
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
72+
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
73+
6674
# Unsafe bytes to be removed per WHATWG spec
6775
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
6876

@@ -201,6 +209,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
201209
(e.g. netloc is a single string) and we don't expand % escapes."""
202210
url = _remove_unsafe_bytes_from_url(url)
203211
scheme = _remove_unsafe_bytes_from_url(scheme)
212+
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
213+
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
204214
allow_fragments = bool(allow_fragments)
205215
key = url, scheme, allow_fragments, type(url), type(scheme)
206216
cached = _parse_cache.get(key, None)

0 commit comments

Comments
 (0)