From 51541ecc582cbc0196bd34c4cad74b3b15f8988b Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Wed, 4 Sep 2024 16:06:32 -1000 Subject: [PATCH] Avoid trying to parse strings that cannot be IP Addresses (#1104) --- CHANGES/1104.misc.rst | 3 +++ tests/test_url_build.py | 15 +++++++++++ yarl/_url.py | 58 +++++++++++++++++++++++++++-------------- 3 files changed, 56 insertions(+), 20 deletions(-) create mode 100644 CHANGES/1104.misc.rst diff --git a/CHANGES/1104.misc.rst b/CHANGES/1104.misc.rst new file mode 100644 index 000000000..16e839b30 --- /dev/null +++ b/CHANGES/1104.misc.rst @@ -0,0 +1,3 @@ +Improved performance of encoding hosts -- by :user:`bdraco`. + +Previously, the library would unconditionally try to parse a host as an IP Address. The library now avoids trying to parse a host as an IP Address if the string is not in one of the formats described in :rfc:`3986#section-3.2.2`. diff --git a/tests/test_url_build.py b/tests/test_url_build.py index 5aecbc585..1d5c4d09c 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -15,6 +15,21 @@ def test_build_simple(): assert str(u) == "http://127.0.0.1" +def test_url_build_ipv6(): + u = URL.build(scheme="http", host="::1") + assert str(u) == "http://::1" + + +def test_url_build_ipv6_brackets(): + u = URL.build(scheme="http", host="[::1]") + assert str(u) == "http://::1" + + +def test_url_ipv4_in_ipv6(): + u = URL.build(scheme="http", host="2001:db8:122:344::192.0.2.33") + assert str(u) == "http://2001:db8:122:344::c000:221" + + def test_build_with_scheme(): u = URL.build(scheme="blob", path="path") assert str(u) == "blob:path" diff --git a/yarl/_url.py b/yarl/_url.py index b32856bc2..9a429e45b 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -850,27 +850,45 @@ def _normalize_path(cls, path: str) -> str: @classmethod def _encode_host(cls, host: str, human: bool = False) -> str: raw_ip, sep, zone = host.partition("%") - # IP parsing is slow, so its wrapped in an LRU - try: - ip_compressed_version = 
_ip_compressed_version(raw_ip) - except ValueError: - host = host.lower() - # IDNA encoding is slow, - # skip it for ASCII-only strings - # Don't move the check into _idna_encode() helper - # to reduce the cache size - if human or host.isascii(): + if raw_ip and raw_ip[-1].isdigit() or ":" in raw_ip: + # Might be an IP address, check it + # + # IP Addresses can look like: + # https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 + # - 127.0.0.1 (last character is a digit) + # - 2001:db8::ff00:42:8329 (contains a colon) + # - 2001:db8::ff00:42:8329%eth0 (contains a colon) + # - [2001:db8::ff00:42:8329] (contains a colon) + # Rare IP Address formats are not supported per: + # https://datatracker.ietf.org/doc/html/rfc3986#section-7.4 + # + # We try to avoid parsing IP addresses as much as possible + # since it's orders of magnitude slower than almost any other operation + # this library does. + # + # IP parsing is slow, so it's wrapped in an LRU + try: + ip_compressed_version = _ip_compressed_version(raw_ip) + except ValueError: + pass + else: + # These checks should not happen in the + # LRU to keep the cache size small + host, version = ip_compressed_version + if sep: + host += "%" + zone + if version == 6: + return f"[{host}]" return host - return _idna_encode(host) - - # These checks should not happen in the - # LRU to keep the cache size small - host, version = ip_compressed_version - if sep: - host += "%" + zone - if version == 6: - return f"[{host}]" - return host + + host = host.lower() + # IDNA encoding is slow, + # skip it for ASCII-only strings + # Don't move the check into _idna_encode() helper + # to reduce the cache size + if human or host.isascii(): + return host + return _idna_encode(host) @classmethod def _make_netloc(