Skip to content

Commit

Permalink
Preserve the status of allow_none in results.
Browse files Browse the repository at this point in the history
  • Loading branch information
serhiy-storchaka committed Nov 27, 2024
1 parent b50b778 commit eaa9ce6
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 73 deletions.
20 changes: 10 additions & 10 deletions Doc/library/urllib.parse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,8 @@ or on combining URL components into a URL string.
a ``?`` for an empty query), only ``None`` components are omitted.
This makes it possible to restore the URL that was parsed with the
``allow_none=True`` option.
By default, *keep_empty* is true if *parts* is the result of the
:func:`urlparse` call with ``allow_none=True``.

.. versionchanged:: 3.14
Added the *keep_empty* parameter.
Expand Down Expand Up @@ -417,6 +419,8 @@ or on combining URL components into a URL string.
a ``?`` for an empty query), only ``None`` components are omitted.
This makes it possible to restore the URL that was parsed with the
``allow_none=True`` option.
By default, *keep_empty* is true if *parts* is the result of the
:func:`urlsplit` call with ``allow_none=True``.

.. versionchanged:: 3.14
Added the *keep_empty* parameter.
Expand Down Expand Up @@ -461,10 +465,8 @@ or on combining URL components into a URL string.


.. versionchanged:: 3.5
Behavior updated to match the semantics defined in :rfc:`3986`.

.. versionchanged:: 3.14
Added the *keep_empty* parameter.
Behavior updated to match the semantics defined in :rfc:`3986`.


.. function:: urldefrag(url, *, allow_none=False)
Expand Down Expand Up @@ -588,12 +590,13 @@ These subclasses add the attributes listed in the documentation for
those functions, the encoding and decoding support described in the
previous section, as well as an additional method:

.. method:: urllib.parse.SplitResult.geturl(*, keep_empty=False)
.. method:: urllib.parse.SplitResult.geturl()

Return the re-combined version of the original URL as a string. This may
differ from the original URL in that the scheme may be normalized to lower
case and empty components may be dropped. Specifically, empty parameters,
queries, and fragment identifiers will be removed unless *keep_empty* is true.
queries, and fragment identifiers will be removed unless the URL was parsed
with ``allow_none=True``.

For :func:`urldefrag` results, only empty fragment identifiers will be removed.
For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be
Expand All @@ -611,11 +614,8 @@ previous section, as well as an additional method:
>>> r2.geturl()
'http://www.Python.org/doc/'
>>> r3 = urlsplit(url, allow_none=True)
>>> r1.geturl(keep_empty=True)
'http://www.Python.org/doc/'

.. versionchanged:: 3.14
Added the *keep_empty* parameter.
>>> r3.geturl()
'http://www.Python.org/doc/#'


The following classes provide the implementations of the structured parse
Expand Down
5 changes: 3 additions & 2 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -595,8 +595,9 @@ urllib.parse
* Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag` functions.
Add the *keep_empty* parameter to :func:`~urllib.parse.urlunparse` and
:func:`~urllib.parse.urlunsplit` functions and
:func:`~urllib.parse.SplitResult.geturl` methods.
:func:`~urllib.parse.urlunsplit` functions.
This makes it possible to distinguish between empty and undefined URI
components and to preserve empty components.
(Contributed by Serhiy Storchaka in :gh:`67041`.)

uuid
Expand Down
81 changes: 53 additions & 28 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unicodedata
import unittest
import urllib.parse
from urllib.parse import urlparse, urlsplit, urlunparse, urlunsplit

RFC1808_BASE = "http://a/b/c/d;p?q#f"
RFC2396_BASE = "http://a/b/c/d;p?q"
Expand Down Expand Up @@ -119,23 +120,50 @@ def _encode(self, s):
return tuple(self._encode(x) for x in s)
return s

def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
def checkRoundtrips(self, url, parsed, split, url2=None):
if url2 is None:
url2 = url
result = urllib.parse.urlparse(url, allow_none=allow_none)
self.checkRoundtrips1(url, parsed, split, allow_none=True)
empty = url[:0]
parsed = tuple(x or empty for x in parsed)
split = tuple(x or empty for x in split)
self.checkRoundtrips1(url, parsed, split, url2, allow_none=False)

result = urlparse(url, allow_none=True)
self.assertEqual(urlunparse(result, keep_empty=False), url2)
self.assertEqual(urlunparse(tuple(result), keep_empty=False), url2)
result = urlparse(url, allow_none=False)
with self.assertRaises(ValueError):
urlunparse(result, keep_empty=True)
urlunparse(tuple(result), keep_empty=True)

result = urlsplit(url, allow_none=True)
self.assertEqual(urlunsplit(result, keep_empty=False), url2)
self.assertEqual(urlunsplit(tuple(result), keep_empty=False), url2)
result = urlsplit(url, allow_none=False)
with self.assertRaises(ValueError):
urlunsplit(result, keep_empty=True)
urlunsplit(tuple(result), keep_empty=True)

def checkRoundtrips1(self, url, parsed, split, url2=None, *, allow_none):
if url2 is None:
url2 = url
result = urlparse(url, allow_none=allow_none)
self.assertSequenceEqual(result, parsed)
t = (result.scheme, result.netloc, result.path,
result.params, result.query, result.fragment)
result.params, result.query, result.fragment)
self.assertSequenceEqual(t, parsed)
# put it back together and it should be the same
result2 = urllib.parse.urlunparse(result, keep_empty=allow_none)
self.assertSequenceEqual(result2, url2)
self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
result2 = urlunparse(result)
self.assertEqual(result2, url2)
self.assertEqual(result2, result.geturl())
self.assertEqual(urlunparse(result, keep_empty=allow_none), url2)
self.assertEqual(urlunparse(tuple(result), keep_empty=allow_none), result2)

# the result of geturl() is a fixpoint; we can always parse it
# again to get the same result:
result3 = urllib.parse.urlparse(result.geturl(keep_empty=allow_none), allow_none=allow_none)
self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
result3 = urlparse(result.geturl(), allow_none=allow_none)
self.assertEqual(result3.geturl(), result.geturl())
self.assertSequenceEqual(result3, result)
self.assertEqual(result3.scheme, result.scheme)
self.assertEqual(result3.netloc, result.netloc)
Expand All @@ -149,18 +177,19 @@ def checkRoundtrips(self, url, parsed, split, url2=None, *, allow_none=True):
self.assertEqual(result3.port, result.port)

# check the roundtrip using urlsplit() as well
result = urllib.parse.urlsplit(url, allow_none=allow_none)
result = urlsplit(url, allow_none=allow_none)
self.assertSequenceEqual(result, split)
t = (result.scheme, result.netloc, result.path,
result.query, result.fragment)
result.query, result.fragment)
self.assertSequenceEqual(t, split)
result2 = urllib.parse.urlunsplit(result, keep_empty=allow_none)
self.assertSequenceEqual(result2, url2)
self.assertSequenceEqual(result2, result.geturl(keep_empty=allow_none))
result2 = urlunsplit(result)
self.assertEqual(result2, url2)
self.assertEqual(result2, result.geturl())
self.assertEqual(urlunsplit(tuple(result), keep_empty=allow_none), result2)

# check the fixpoint property of re-parsing the result of geturl()
result3 = urllib.parse.urlsplit(result.geturl(keep_empty=allow_none), allow_none=allow_none)
self.assertEqual(result3.geturl(keep_empty=allow_none), result.geturl(keep_empty=allow_none))
result3 = urlsplit(result.geturl(), allow_none=allow_none)
self.assertEqual(result3.geturl(), result.geturl())
self.assertSequenceEqual(result3, result)
self.assertEqual(result3.scheme, result.scheme)
self.assertEqual(result3.netloc, result.netloc)
Expand Down Expand Up @@ -288,32 +317,28 @@ def test_roundtrips(self):
]
for url, parsed, split in str_cases + bytes_cases:
with self.subTest(url):
self.checkRoundtrips(url, parsed, split, allow_none=True)
empty = url[:0]
parsed = tuple(x or empty for x in parsed)
split = tuple(x or empty for x in split)
self.checkRoundtrips(url, parsed, split, allow_none=False)
self.checkRoundtrips(url, parsed, split)

def test_roundtrips_normalization(self):
str_cases = [
('///path/to/file',
'///path/to/file',
'/path/to/file',
(None, '', '/path/to/file', None, None, None),
(None, '', '/path/to/file', None, None)),
('scheme:///path/to/file',
'scheme:///path/to/file',
'scheme:/path/to/file',
('scheme', '', '/path/to/file', None, None, None),
('scheme', '', '/path/to/file', None, None)),
('file:/tmp/junk.txt',
'file:/tmp/junk.txt',
'file:///tmp/junk.txt',
('file', None, '/tmp/junk.txt', None, None, None),
('file', None, '/tmp/junk.txt', None, None)),
('http:/tmp/junk.txt',
'http:/tmp/junk.txt',
'http:///tmp/junk.txt',
('http', None, '/tmp/junk.txt', None, None, None),
('http', None, '/tmp/junk.txt', None, None)),
('https:/tmp/junk.txt',
'https:/tmp/junk.txt',
'https:///tmp/junk.txt',
('https', None, '/tmp/junk.txt', None, None, None),
('https', None, '/tmp/junk.txt', None, None)),
]
Expand Down Expand Up @@ -371,9 +396,9 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
relurlb2 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
self.assertEqual(urllib.parse.urljoin(baseb, relurlb2), expectedb)

relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True), keep_empty=True)
relurl3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl, allow_none=True))
self.assertEqual(urllib.parse.urljoin(base, relurl3), expected)
relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True), keep_empty=True)
relurlb3 = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb, allow_none=True))
self.assertEqual(urllib.parse.urljoin(baseb, relurlb3), expectedb)

def test_unparse_parse(self):
Expand Down Expand Up @@ -796,7 +821,7 @@ def _encode(t):
url = url.rstrip(hash)
if frag is None:
frag = url[:0]
self.assertEqual(result.geturl(keep_empty=allow_none), url)
self.assertEqual(result.geturl(), url)
self.assertEqual(result, (defrag, frag))
self.assertEqual(result.url, defrag)
self.assertEqual(result.fragment, frag)
Expand Down
85 changes: 52 additions & 33 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,27 @@ def _hostinfo(self):
return hostname, port


_DefragResultBase = namedtuple('_DefragResultBase', 'url fragment')
_SplitResultBase = namedtuple(
'_SplitResultBase', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
'_ParseResultBase', 'scheme netloc path params query fragment')
_UNSPECIFIED = ['not specified']
_ALLOW_NONE_DEFAULT = False

class _DefragResultBase(namedtuple('_DefragResultBase', 'url fragment')):
def geturl(self):
if self.fragment or (self.fragment is not None and
getattr(self, '_keep_empty', _ALLOW_NONE_DEFAULT)):
return self.url + self._HASH + self.fragment
else:
return self.url

class _SplitResultBase(namedtuple(
'_SplitResultBase', 'scheme netloc path query fragment')):
def geturl(self):
return urlunsplit(self)

class _ParseResultBase(namedtuple(
'_ParseResultBase', 'scheme netloc path params query fragment')):
def geturl(self):
return urlunparse(self)


_DefragResultBase.__doc__ = """
DefragResult(url, fragment)
Expand Down Expand Up @@ -339,45 +355,27 @@ def _hostinfo(self):
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

_ALLOW_NONE_DEFAULT = False

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
if self.fragment or (keep_empty and self.fragment is not None):
return self.url + '#' + self.fragment
else:
return self.url
_HASH = '#'

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
return urlunsplit(self, keep_empty=keep_empty)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
return urlunparse(self, keep_empty=keep_empty)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
if self.fragment or (keep_empty and self.fragment is not None):
return self.url + b'#' + self.fragment
else:
return self.url
_HASH = b'#'

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
return urlunsplit(self, keep_empty=keep_empty)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
__slots__ = ()
def geturl(self, *, keep_empty=_ALLOW_NONE_DEFAULT):
return urlunparse(self, keep_empty=keep_empty)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
Expand Down Expand Up @@ -424,7 +422,9 @@ def urlparse(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
if query is None: query = ''
if fragment is None: fragment = ''
result = ParseResult(scheme, netloc, url, params, query, fragment)
return _coerce_result(result)
result = _coerce_result(result)
result._keep_empty = allow_none
return result

def _urlparse(url, scheme=None, allow_fragments=True):
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
Expand Down Expand Up @@ -513,8 +513,10 @@ def urlsplit(url, scheme=None, allow_fragments=True, *, allow_none=_ALLOW_NONE_D
if netloc is None: netloc = ''
if query is None: query = ''
if fragment is None: fragment = ''
v = SplitResult(scheme, netloc, url, query, fragment)
return _coerce_result(v)
result = SplitResult(scheme, netloc, url, query, fragment)
result = _coerce_result(result)
result._keep_empty = allow_none
return result

def _urlsplit(url, scheme=None, allow_fragments=True):
# Only lstrip url as some applications rely on preserving trailing space.
Expand Down Expand Up @@ -551,13 +553,20 @@ def _urlsplit(url, scheme=None, allow_fragments=True):
_checknetloc(netloc)
return (scheme, netloc, url, query, fragment)

def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
def urlunparse(components, *, keep_empty=_UNSPECIFIED):
"""Put a parsed URL back together again. This may result in a
slightly different, but equivalent URL, if the URL that was parsed
originally had redundant delimiters, e.g. a ? with an empty query
(the draft states that these are equivalent)."""
(the draft states that these are equivalent) and keep_empty is false
or components is the result of the urlparse() call with allow_none=False."""
scheme, netloc, url, params, query, fragment, _coerce_result = (
_coerce_args(*components))
if keep_empty is _UNSPECIFIED:
keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
elif keep_empty and not getattr(components, '_keep_empty', True):
raise ValueError('Cannot distinguish between empty and not defined '
'URI components in the result of parsing URL with '
'allow_none=False')
if not keep_empty:
if not netloc:
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
Expand All @@ -572,14 +581,22 @@ def urlunparse(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
url = "%s;%s" % (url, params)
return _coerce_result(_urlunsplit(scheme, netloc, url, query, fragment))

def urlunsplit(components, *, keep_empty=_ALLOW_NONE_DEFAULT):
def urlunsplit(components, *, keep_empty=_UNSPECIFIED):
"""Combine the elements of a tuple as returned by urlsplit() into a
complete URL as a string. The data argument can be any five-item iterable.
This may result in a slightly different, but equivalent URL, if the URL that
was parsed originally had unnecessary delimiters (for example, a ? with an
empty query; the RFC states that these are equivalent)."""
empty query; the RFC states that these are equivalent) and keep_empty
is false or components is the result of the urlsplit() call with
allow_none=False."""
scheme, netloc, url, query, fragment, _coerce_result = (
_coerce_args(*components))
if keep_empty is _UNSPECIFIED:
keep_empty = getattr(components, '_keep_empty', _ALLOW_NONE_DEFAULT)
elif keep_empty and not getattr(components, '_keep_empty', True):
raise ValueError('Cannot distinguish between empty and not defined '
'URI components in the result of parsing URL with '
'allow_none=False')
if not keep_empty:
if not netloc:
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
Expand Down Expand Up @@ -692,7 +709,9 @@ def urldefrag(url, *, allow_none=_ALLOW_NONE_DEFAULT):
frag = None
defrag = url
if not allow_none and frag is None: frag = ''
return _coerce_result(DefragResult(defrag, frag))
result = _coerce_result(DefragResult(defrag, frag))
result._keep_empty = allow_none
return result

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Add the *allow_none* parameter to :func:`~urllib.parse.urlparse`,
:func:`~urllib.parse.urlsplit` and :func:`~urllib.parse.urldefrag`
functions. Add the *keep_empty* parameter to
:func:`~urllib.parse.urlunparse` and :func:`~urllib.parse.urlunsplit`
functions. This makes it possible to distinguish between empty and
undefined URI components and to preserve empty components.

0 comments on commit eaa9ce6

Please sign in to comment.