From 60c9b553f29a9854e244ea21e412d7928bd57bcb Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Sun, 25 Apr 2021 07:47:20 -0700 Subject: [PATCH 1/7] issue43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. --- Doc/library/urllib.parse.rst | 6 ++++ Lib/test/test_urlparse.py | 28 +++++++++++++++++++ Lib/urllib/parse.py | 3 ++ .../2021-04-25-07-46-37.bpo-43882.Jpwx85.rst | 6 ++++ 4 files changed, 43 insertions(+) create mode 100644 Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 67c21208196b84..2a4ef69fe110b2 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -312,6 +312,10 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. + Following the specification in WHATWG which updates RFC 3986, ASCII newline + ``\n``, ``\r`` or ``\r\n`` and tab ``\t`` characters are stripped from + the url. + .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of returning :const:`None`. @@ -320,6 +324,8 @@ or on combining URL components into a URL string. Characters that affect netloc parsing under NFKC normalization will now raise :exc:`ValueError`. + .. versionchanged:: 3.10 + ASCII newline and tab characters are stripped from the url. .. function:: urlunsplit(parts) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index c543ac9a4ba72e..9c1a5917c6b8b9 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -566,6 +566,21 @@ def test_urlsplit_attributes(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), url) + # Remove ASCII tabs and newlines from input + + url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "www.python.org") + self.assertEqual(p.path, "/javascript:alert('msg')/") + self.assertEqual(p.query, "") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag") + # And check them all again, only with bytes this time url = b"HTTP://WWW.PYTHON.ORG/doc/#frag" p = urllib.parse.urlsplit(url) @@ -606,6 +621,19 @@ def test_urlsplit_attributes(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), url) + url = b"http://www.python.org/java\nscript:\talert('msg\r\n')/#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"www.python.org") + self.assertEqual(p.path, b"/javascript:alert('msg')/") + self.assertEqual(p.query, b"") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/#frag") + # Verify an illegal port raises ValueError url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag" p = urllib.parse.urlsplit(url) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 21cae47bf38a36..5d18a7dfa37c6e 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -469,6 +469,9 @@ def urlsplit(url, scheme='', allow_fragments=True): else: scheme, url = url[:i].lower(), url[i+1:] + _unsafe_chars_to_remove = ['\t', '\r', '\n'] + url = url.translate({ord(c): None for c in _unsafe_chars_to_remove}) + if url[:2] == '//': netloc, url = _splitnetloc(url, 2) if (('[' in netloc and ']' not in netloc) or diff --git a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst new file mode 100644 index 00000000000000..d0463601710b46 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst @@ -0,0 +1,6 @@ +Presence newline or tab characters in URL allowed attackers to write scripts +in URL, hijack the the the web-server. + +Following the controlling specification for URLs defined by WHATWG +urllib.parse strips ASCII newline and tabs from the url, preventing such +attacks. From ee779d61c8f70f4fd68847f9820cf25e06306f94 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Wed, 28 Apr 2021 08:00:47 -0700 Subject: [PATCH 2/7] Address Code Review Comments. --- Doc/library/urllib.parse.rst | 11 ++++++--- Lib/test/test_urlparse.py | 43 ++++++++++++++++++------------------ Lib/urllib/parse.py | 7 ++++-- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 2a4ef69fe110b2..0aeec389f8844d 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -312,9 +312,8 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. - Following the specification in WHATWG which updates RFC 3986, ASCII newline - ``\n``, ``\r`` or ``\r\n`` and tab ``\t`` characters are stripped from - the url. + Following the specification in `WHATWG`_ which updates RFC 3986, ASCII newline + ``\n``, ``\r`` and tab ``\t`` characters are stripped from the url. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of @@ -680,6 +679,10 @@ task isn't already covered by the URL parsing functions above. .. seealso:: + `WHATWG`_ - URL Living standard + Working Group for the URL Standard that defines URLs, domains, IP addresses, the + application/x-www-form-urlencoded format, and their API. + :rfc:`3986` - Uniform Resource Identifiers This is the current standard (STD66). Any changes to urllib.parse module should conform to this. Certain deviations could be observed, which are @@ -703,3 +706,5 @@ task isn't already covered by the URL parsing functions above. :rfc:`1738` - Uniform Resource Locators (URL) This specifies the formal syntax and semantics of absolute URLs. + +.. _WHATWG: https://url.spec.whatwg.org/#concept-basic-url-parser diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 9c1a5917c6b8b9..67341fecef94cd 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -566,21 +566,6 @@ def test_urlsplit_attributes(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), url) - # Remove ASCII tabs and newlines from input - - url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag" - p = urllib.parse.urlsplit(url) - self.assertEqual(p.scheme, "http") - self.assertEqual(p.netloc, "www.python.org") - self.assertEqual(p.path, "/javascript:alert('msg')/") - self.assertEqual(p.query, "") - self.assertEqual(p.fragment, "frag") - self.assertEqual(p.username, None) - self.assertEqual(p.password, None) - self.assertEqual(p.hostname, "www.python.org") - self.assertEqual(p.port, None) - self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag") - # And check them all again, only with bytes this time url = b"HTTP://WWW.PYTHON.ORG/doc/#frag" p = urllib.parse.urlsplit(url) @@ -621,6 +606,28 @@ def test_urlsplit_attributes(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), url) + # Verify an illegal port raises ValueError + url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag" + p = urllib.parse.urlsplit(url) + with self.assertRaisesRegex(ValueError, "out of range"): + p.port + + def test_urlsplit_remove_unsafe_bytes(self): + # Remove ASCII tabs and newlines from input + url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag" + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "www.python.org") + self.assertEqual(p.path, "/javascript:alert('msg')/") + self.assertEqual(p.query, "") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, None) + self.assertEqual(p.password, None) + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, None) + self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag") + + # Remove ASCII tabs and newlines from input as bytes. url = b"http://www.python.org/java\nscript:\talert('msg\r\n')/#frag" p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, b"http") @@ -634,12 +641,6 @@ def test_urlsplit_attributes(self): self.assertEqual(p.port, None) self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/#frag") - # Verify an illegal port raises ValueError - url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag" - p = urllib.parse.urlsplit(url) - with self.assertRaisesRegex(ValueError, "out of range"): - p.port - def test_attributes_bad_port(self): """Check handling of invalid ports.""" for bytes in (False, True): diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5d18a7dfa37c6e..c11c695a741c8a 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -78,6 +78,9 @@ '0123456789' '+-.') +# Unsafe bytes to be removed per WHATWG spec +_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] + # XXX: Consider replacing with functools.lru_cache MAX_CACHE_SIZE = 20 _parse_cache = {} @@ -469,8 +472,8 @@ def urlsplit(url, scheme='', allow_fragments=True): else: scheme, url = url[:i].lower(), url[i+1:] - _unsafe_chars_to_remove = ['\t', '\r', '\n'] - url = url.translate({ord(c): None for c in _unsafe_chars_to_remove}) + for b in _UNSAFE_URL_BYTES_TO_REMOVE: + url = url.replace(b, "") if url[:2] == '//': netloc, url = _splitnetloc(url, 2) From e81f32f8bd211ef626eb89d10a92bb0ebbe48770 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Wed, 28 Apr 2021 08:04:41 -0700 Subject: [PATCH 3/7] Give link to specification. Reference in WG page in Seealso links. --- Doc/library/urllib.parse.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 0aeec389f8844d..bd90ee89138041 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -312,7 +312,7 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. - Following the specification in `WHATWG`_ which updates RFC 3986, ASCII newline + Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline ``\n``, ``\r`` and tab ``\t`` characters are stripped from the url. .. versionchanged:: 3.6 @@ -326,6 +326,8 @@ or on combining URL components into a URL string. .. versionchanged:: 3.10 ASCII newline and tab characters are stripped from the url. +.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser + .. function:: urlunsplit(parts) Combine the elements of a tuple as returned by :func:`urlsplit` into a @@ -707,4 +709,4 @@ task isn't already covered by the URL parsing functions above. :rfc:`1738` - Uniform Resource Locators (URL) This specifies the formal syntax and semantics of absolute URLs. -.. _WHATWG: https://url.spec.whatwg.org/#concept-basic-url-parser +.. _WHATWG: https://url.spec.whatwg.org/ From ae07b8f2ed2cb9ffbbb5b9587b3b8e7167aada88 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 28 Apr 2021 19:20:29 -0700 Subject: [PATCH 4/7] Wording cleanup and ReST editing for NEWS. --- .../Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst index d0463601710b46..171509ef9daa35 100644 --- a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst +++ b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst @@ -1,6 +1,6 @@ -Presence newline or tab characters in URL allowed attackers to write scripts -in URL, hijack the the the web-server. +The presence of newline or tab characters in parts of a URL could allow +some forms of attacks. Following the controlling specification for URLs defined by WHATWG -urllib.parse strips ASCII newline and tabs from the url, preventing such -attacks. +:func:`urllib.parse` now removes ASCII newlines and tabs from urls, +preventing such attacks. From 908e744a28a2275df39ba6a1402090bf07494ef8 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Thu, 29 Apr 2021 03:17:57 -0700 Subject: [PATCH 5/7] Update Doc/library/urllib.parse.rst Co-authored-by: Serhiy Storchaka --- Doc/library/urllib.parse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index bd90ee89138041..b6126b0d13e52c 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -313,7 +313,7 @@ or on combining URL components into a URL string. decomposed before parsing, no error will be raised. Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline - ``\n``, ``\r`` and tab ``\t`` characters are stripped from the url. + ``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of From 5f61b9ebf885be41ed19fe605bdc8c5bb599eb9e Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Thu, 29 Apr 2021 03:18:03 -0700 Subject: [PATCH 6/7] Update Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst Co-authored-by: Serhiy Storchaka --- .../next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst index 171509ef9daa35..a326d079dff4a4 100644 --- a/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst +++ b/Misc/NEWS.d/next/Security/2021-04-25-07-46-37.bpo-43882.Jpwx85.rst @@ -2,5 +2,5 @@ The presence of newline or tab characters in parts of a URL could allow some forms of attacks. Following the controlling specification for URLs defined by WHATWG -:func:`urllib.parse` now removes ASCII newlines and tabs from urls, +:func:`urllib.parse` now removes ASCII newlines and tabs from URLs, preventing such attacks. From 4afcf42c94b96d164e03f1e0fd0bbf35bbdd0f67 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Thu, 29 Apr 2021 03:18:16 -0700 Subject: [PATCH 7/7] Update Doc/library/urllib.parse.rst Co-authored-by: Serhiy Storchaka --- Doc/library/urllib.parse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index b6126b0d13e52c..0aaac562883f18 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -324,7 +324,7 @@ or on combining URL components into a URL string. now raise :exc:`ValueError`. .. versionchanged:: 3.10 - ASCII newline and tab characters are stripped from the url. + ASCII newline and tab characters are stripped from the URL. .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser