RFC3986 compatible URL.join honoring empty segments again (#1082)

Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко) <wk.cvs.github@sydorenko.org.ua> Co-authored-by: J. Nick Koston <nick@koston.org> Co-authored-by: Sam Bull <aa6bs0@sambull.org> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
aio-libs · Sep 4, 2024 · af585b2 · af585b2
1 parent b631867
commit af585b2
Show file tree

Hide file tree

Showing 4 changed files with 166 additions and 10 deletions.
diff --git a/CHANGES/1039.bugfix.rst b/CHANGES/1039.bugfix.rst
@@ -0,0 +1 @@
+1082.bugfix.rst
diff --git a/CHANGES/1082.bugfix.rst b/CHANGES/1082.bugfix.rst
@@ -0,0 +1,29 @@
+:meth:`URL.join() <yarl.URL.join>` has been changed to match
+:rfc:`3986` and align with
+:meth:`/ operation <yarl.URL.__truediv__>` and :meth:`URL.joinpath() <yarl.URL.joinpath>`
+when joining URLs with empty segments.
+Previously :py:func:`urllib.parse.urljoin` was used,
+which has known issues with empty segments
+(`python/cpython#84774 <https://github.com/python/cpython/issues/84774>`_).
+
+Due to the semantics of :meth:`URL.join() <yarl.URL.join>`, joining an
+URL with scheme requires making it relative, prefixing with ``./``.
+
+.. code-block:: pycon
+
+   >>> URL("https://web.archive.org/web/").join(URL("./https://github.com/aio-libs/yarl"))
+   URL('https://web.archive.org/web/https://github.com/aio-libs/yarl')
+
+
+Empty segments are honored in the base as well as the joined part.
+
+.. code-block:: pycon
+
+   >>> URL("https://web.archive.org/web/https://").join(URL("github.com/aio-libs/yarl"))
+   URL('https://web.archive.org/web/https://github.com/aio-libs/yarl')
+
+
+
+-- by :user:`commonism`
+
+This change initially appeared in 1.9.5 but was reverted in 1.9.6 to resolve a problem with query string handling.
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -1669,6 +1669,81 @@ def test_join_from_rfc_3986_abnormal(url, expected):
     assert base.join(url) == expected
 
 
+EMPTY_SEGMENTS = [
+    (
+        "https://web.archive.org/web/",
+        "./https://github.com/aio-libs/yarl",
+        "https://web.archive.org/web/https://github.com/aio-libs/yarl",
+    ),
+    (
+        "https://web.archive.org/web/https://github.com/",
+        "aio-libs/yarl",
+        "https://web.archive.org/web/https://github.com/aio-libs/yarl",
+    ),
+]
+
+
+@pytest.mark.parametrize("base,url,expected", EMPTY_SEGMENTS)
+def test_join_empty_segments(base, url, expected):
+    base = URL(base)
+    url = URL(url)
+    expected = URL(expected)
+    joined = base.join(url)
+    assert joined == expected
+
+
+SIMPLE_BASE = "http://a/b/c/d"
+URLLIB_URLJOIN = [
+    ("", "http://a/b/c/g?y/./x", "http://a/b/c/g?y/./x"),
+    ("", "http://a/./g", "http://a/./g"),
+    ("svn://pathtorepo/dir1", "dir2", "svn://pathtorepo/dir2"),
+    ("svn+ssh://pathtorepo/dir1", "dir2", "svn+ssh://pathtorepo/dir2"),
+    ("ws://a/b", "g", "ws://a/g"),
+    ("wss://a/b", "g", "wss://a/g"),
+    # test for issue22118 duplicate slashes
+    (SIMPLE_BASE + "/", "foo", SIMPLE_BASE + "/foo"),
+    # Non-RFC-defined tests, covering variations of base and trailing
+    # slashes
+    ("http://a/b/c/d/e/", "../../f/g/", "http://a/b/c/f/g/"),
+    ("http://a/b/c/d/e", "../../f/g/", "http://a/b/f/g/"),
+    ("http://a/b/c/d/e/", "/../../f/g/", "http://a/f/g/"),
+    ("http://a/b/c/d/e", "/../../f/g/", "http://a/f/g/"),
+    ("http://a/b/c/d/e/", "../../f/g", "http://a/b/c/f/g"),
+    ("http://a/b/", "../../f/g/", "http://a/f/g/"),
+    ("a", "b", "b"),
+    ("http:///", "..", "http:///"),
+    ("a/", "b", "a/b"),
+    ("a/b", "c", "a/c"),
+    ("a/b/", "c", "a/b/c"),
+    (
+        "https://x.org/",
+        "/?text=Hello+G%C3%BCnter",
+        "https://x.org/?text=Hello+G%C3%BCnter",
+    ),
+    (
+        "https://x.org/",
+        "?text=Hello+G%C3%BCnter",
+        "https://x.org/?text=Hello+G%C3%BCnter",
+    ),
+    ("http://example.com", "http://example.com", "http://example.com"),
+    ("http://x.org", "https://x.org#fragment", "https://x.org#fragment"),
+]
+
+
+@pytest.mark.parametrize("base,url,expected", URLLIB_URLJOIN)
+def test_join_cpython_urljoin(base, url, expected):
+    # tests from cpython urljoin
+    base = URL(base)
+    url = URL(url)
+    expected = URL(expected)
+    joined = base.join(url)
+    assert joined == expected
+
+
+def test_empty_authority():
+    assert URL("http:///").authority == ""
+
+
 def test_split_result_non_decoded():
     with pytest.raises(ValueError):
         URL(SplitResult("http", "example.com", "path", "qs", "frag"))

diff --git a/yarl/_url.py b/yarl/_url.py
@@ -19,7 +19,15 @@
     Union,
     overload,
 )
-from urllib.parse import SplitResult, parse_qsl, quote, urljoin, urlsplit, urlunsplit
+from urllib.parse import (
+    SplitResult,
+    parse_qsl,
+    quote,
+    urlsplit,
+    urlunsplit,
+    uses_netloc,
+    uses_relative,
+)
 
 import idna
 from multidict import MultiDict, MultiDictProxy
@@ -28,6 +36,8 @@
 from ._quoting import _Quoter, _Unquoter
 
 DEFAULT_PORTS = {"http": 80, "https": 443, "ws": 80, "wss": 443}
+USES_AUTHORITY = frozenset(uses_netloc)
+USES_RELATIVE = frozenset(uses_relative)
 
 sentinel = object()
 
@@ -52,6 +62,15 @@ class CacheInfo(TypedDict):
     ip_address: _CacheInfo
 
 
+class _SplitResultDict(TypedDict, total=False):
+
+    scheme: str
+    netloc: str
+    path: str
+    query: str
+    fragment: str
+
+
 def rewrite_module(obj: _T) -> _T:
     obj.__module__ = "yarl"
     return obj
@@ -271,12 +290,14 @@ def build(
                 netloc = authority
             else:
                 tmp = SplitResult("", authority, "", "", "")
+                port = None if tmp.port == DEFAULT_PORTS.get(scheme) else tmp.port
                 netloc = cls._make_netloc(
-                    tmp.username, tmp.password, tmp.hostname, tmp.port, encode=True
+                    tmp.username, tmp.password, tmp.hostname, port, encode=True
                 )
         elif not user and not password and not host and not port:
             netloc = ""
         else:
+            port = None if port == DEFAULT_PORTS.get(scheme) else port
             netloc = cls._make_netloc(
                 user, password, host, port, encode=not encoded, encode_host=not encoded
             )
@@ -791,11 +812,7 @@ def _make_child(self, paths: "Sequence[str]", encoded: bool = False) -> "URL":
                     f"Appending path {path!r} starting from slash is forbidden"
                 )
             path = path if encoded else self._PATH_QUOTER(path)
-            segments = [
-                segment for segment in reversed(path.split("/")) if segment != "."
-            ]
-            if not segments:
-                continue
+            segments = list(reversed(path.split("/")))
             # remove trailing empty segment for all but the last path
             segment_slice_start = int(not last and segments[0] == "")
             parsed += segments[segment_slice_start:]
@@ -1212,10 +1229,44 @@ def join(self, url: "URL") -> "URL":
         relative URL.
 
         """
-        # See docs for urllib.parse.urljoin
-        if not isinstance(url, URL):
+        if type(url) is not URL:
             raise TypeError("url should be URL")
-        return URL(urljoin(str(self), str(url)), encoded=True)
+        val = self._val
+        other_val = url._val
+        scheme = other_val.scheme or val.scheme
+
+        if scheme != val.scheme or scheme not in USES_RELATIVE:
+            return url
+
+        # scheme is in uses_authority as uses_authority is a superset of uses_relative
+        if other_val.netloc and scheme in USES_AUTHORITY:
+            return URL(other_val._replace(scheme=scheme), encoded=True)
+
+        parts: _SplitResultDict = {"scheme": scheme}
+        if other_val.path or other_val.fragment:
+            parts["fragment"] = other_val.fragment
+        if other_val.path or other_val.query:
+            parts["query"] = other_val.query
+
+        if not other_val.path:
+            return URL(val._replace(**parts), encoded=True)
+
+        if other_val.path[0] == "/" or not val.path:
+            path = other_val.path
+        elif val.path[-1] == "/":
+            path = f"{val.path}{other_val.path}"
+        else:
+            # …
+            # and relativizing ".."
+            # parts[0] is / for absolute urls, this join will add a double slash there
+            path = "/".join([*self.parts[:-1], ""])
+            path += other_val.path
+            # which has to be removed
+            if val.path[0] == "/":
+                path = path[1:]
+
+        parts["path"] = self._normalize_path(path)
+        return URL(val._replace(**parts), encoded=True)
 
     def joinpath(self, *other: str, encoded: bool = False) -> "URL":
         """Return a new URL with the elements in other appended to the path."""