Skip to content

Commit

Permalink
RFC3986 compatible URL.join honoring empty segments again (#1082)
Browse files Browse the repository at this point in the history
Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко) <wk.cvs.github@sydorenko.org.ua>
Co-authored-by: J. Nick Koston <nick@koston.org>
Co-authored-by: Sam Bull <aa6bs0@sambull.org>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
5 people authored Sep 4, 2024
1 parent b631867 commit af585b2
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGES/1039.bugfix.rst
29 changes: 29 additions & 0 deletions CHANGES/1082.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
:meth:`URL.join() <yarl.URL.join>` has been changed to match
:rfc:`3986` and align with
:meth:`/ operation <yarl.URL.__truediv__>` and :meth:`URL.joinpath() <yarl.URL.joinpath>`
when joining URLs with empty segments.
Previously :py:func:`urllib.parse.urljoin` was used,
which has known issues with empty segments
(`python/cpython#84774 <https://github.com/python/cpython/issues/84774>`_).

Due to the semantics of :meth:`URL.join() <yarl.URL.join>`, joining an
URL with scheme requires making it relative, prefixing with ``./``.

.. code-block:: pycon
>>> URL("https://web.archive.org/web/").join(URL("./https://github.com/aio-libs/yarl"))
URL('https://web.archive.org/web/https://github.com/aio-libs/yarl')
Empty segments are honored in the base as well as the joined part.

.. code-block:: pycon
>>> URL("https://web.archive.org/web/https://").join(URL("github.com/aio-libs/yarl"))
URL('https://web.archive.org/web/https://github.com/aio-libs/yarl')
-- by :user:`commonism`

This change initially appeared in 1.9.5 but was reverted in 1.9.6 to resolve a problem with query string handling.
75 changes: 75 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,81 @@ def test_join_from_rfc_3986_abnormal(url, expected):
assert base.join(url) == expected


EMPTY_SEGMENTS = [
(
"https://web.archive.org/web/",
"./https://github.com/aio-libs/yarl",
"https://web.archive.org/web/https://github.com/aio-libs/yarl",
),
(
"https://web.archive.org/web/https://github.com/",
"aio-libs/yarl",
"https://web.archive.org/web/https://github.com/aio-libs/yarl",
),
]


@pytest.mark.parametrize("base,url,expected", EMPTY_SEGMENTS)
def test_join_empty_segments(base, url, expected):
base = URL(base)
url = URL(url)
expected = URL(expected)
joined = base.join(url)
assert joined == expected


SIMPLE_BASE = "http://a/b/c/d"
URLLIB_URLJOIN = [
("", "http://a/b/c/g?y/./x", "http://a/b/c/g?y/./x"),
("", "http://a/./g", "http://a/./g"),
("svn://pathtorepo/dir1", "dir2", "svn://pathtorepo/dir2"),
("svn+ssh://pathtorepo/dir1", "dir2", "svn+ssh://pathtorepo/dir2"),
("ws://a/b", "g", "ws://a/g"),
("wss://a/b", "g", "wss://a/g"),
# test for issue22118 duplicate slashes
(SIMPLE_BASE + "/", "foo", SIMPLE_BASE + "/foo"),
# Non-RFC-defined tests, covering variations of base and trailing
# slashes
("http://a/b/c/d/e/", "../../f/g/", "http://a/b/c/f/g/"),
("http://a/b/c/d/e", "../../f/g/", "http://a/b/f/g/"),
("http://a/b/c/d/e/", "/../../f/g/", "http://a/f/g/"),
("http://a/b/c/d/e", "/../../f/g/", "http://a/f/g/"),
("http://a/b/c/d/e/", "../../f/g", "http://a/b/c/f/g"),
("http://a/b/", "../../f/g/", "http://a/f/g/"),
("a", "b", "b"),
("http:///", "..", "http:///"),
("a/", "b", "a/b"),
("a/b", "c", "a/c"),
("a/b/", "c", "a/b/c"),
(
"https://x.org/",
"/?text=Hello+G%C3%BCnter",
"https://x.org/?text=Hello+G%C3%BCnter",
),
(
"https://x.org/",
"?text=Hello+G%C3%BCnter",
"https://x.org/?text=Hello+G%C3%BCnter",
),
("http://example.com", "http://example.com", "http://example.com"),
("http://x.org", "https://x.org#fragment", "https://x.org#fragment"),
]


@pytest.mark.parametrize("base,url,expected", URLLIB_URLJOIN)
def test_join_cpython_urljoin(base, url, expected):
# tests from cpython urljoin
base = URL(base)
url = URL(url)
expected = URL(expected)
joined = base.join(url)
assert joined == expected


def test_empty_authority():
assert URL("http:///").authority == ""


def test_split_result_non_decoded():
with pytest.raises(ValueError):
URL(SplitResult("http", "example.com", "path", "qs", "frag"))
Expand Down
71 changes: 61 additions & 10 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@
Union,
overload,
)
from urllib.parse import SplitResult, parse_qsl, quote, urljoin, urlsplit, urlunsplit
from urllib.parse import (
SplitResult,
parse_qsl,
quote,
urlsplit,
urlunsplit,
uses_netloc,
uses_relative,
)

import idna
from multidict import MultiDict, MultiDictProxy
Expand All @@ -28,6 +36,8 @@
from ._quoting import _Quoter, _Unquoter

DEFAULT_PORTS = {"http": 80, "https": 443, "ws": 80, "wss": 443}
USES_AUTHORITY = frozenset(uses_netloc)
USES_RELATIVE = frozenset(uses_relative)

sentinel = object()

Expand All @@ -52,6 +62,15 @@ class CacheInfo(TypedDict):
ip_address: _CacheInfo


class _SplitResultDict(TypedDict, total=False):

scheme: str
netloc: str
path: str
query: str
fragment: str


def rewrite_module(obj: _T) -> _T:
obj.__module__ = "yarl"
return obj
Expand Down Expand Up @@ -271,12 +290,14 @@ def build(
netloc = authority
else:
tmp = SplitResult("", authority, "", "", "")
port = None if tmp.port == DEFAULT_PORTS.get(scheme) else tmp.port
netloc = cls._make_netloc(
tmp.username, tmp.password, tmp.hostname, tmp.port, encode=True
tmp.username, tmp.password, tmp.hostname, port, encode=True
)
elif not user and not password and not host and not port:
netloc = ""
else:
port = None if port == DEFAULT_PORTS.get(scheme) else port
netloc = cls._make_netloc(
user, password, host, port, encode=not encoded, encode_host=not encoded
)
Expand Down Expand Up @@ -791,11 +812,7 @@ def _make_child(self, paths: "Sequence[str]", encoded: bool = False) -> "URL":
f"Appending path {path!r} starting from slash is forbidden"
)
path = path if encoded else self._PATH_QUOTER(path)
segments = [
segment for segment in reversed(path.split("/")) if segment != "."
]
if not segments:
continue
segments = list(reversed(path.split("/")))
# remove trailing empty segment for all but the last path
segment_slice_start = int(not last and segments[0] == "")
parsed += segments[segment_slice_start:]
Expand Down Expand Up @@ -1212,10 +1229,44 @@ def join(self, url: "URL") -> "URL":
relative URL.
"""
# See docs for urllib.parse.urljoin
if not isinstance(url, URL):
if type(url) is not URL:
raise TypeError("url should be URL")
return URL(urljoin(str(self), str(url)), encoded=True)
val = self._val
other_val = url._val
scheme = other_val.scheme or val.scheme

if scheme != val.scheme or scheme not in USES_RELATIVE:
return url

# scheme is in uses_authority as uses_authority is a superset of uses_relative
if other_val.netloc and scheme in USES_AUTHORITY:
return URL(other_val._replace(scheme=scheme), encoded=True)

parts: _SplitResultDict = {"scheme": scheme}
if other_val.path or other_val.fragment:
parts["fragment"] = other_val.fragment
if other_val.path or other_val.query:
parts["query"] = other_val.query

if not other_val.path:
return URL(val._replace(**parts), encoded=True)

if other_val.path[0] == "/" or not val.path:
path = other_val.path
elif val.path[-1] == "/":
path = f"{val.path}{other_val.path}"
else:
# …
# and relativizing ".."
# parts[0] is / for absolute urls, this join will add a double slash there
path = "/".join([*self.parts[:-1], ""])
path += other_val.path
# which has to be removed
if val.path[0] == "/":
path = path[1:]

parts["path"] = self._normalize_path(path)
return URL(val._replace(**parts), encoded=True)

def joinpath(self, *other: str, encoded: bool = False) -> "URL":
"""Return a new URL with the elements in other appended to the path."""
Expand Down

0 comments on commit af585b2

Please sign in to comment.