Skip to content

Commit

Permalink
Optimize netloc extraction (#284)
Browse files Browse the repository at this point in the history
The regex can be replaced with an equivalent if-else check for a tangible speed improvement.

---------

Co-authored-by: John Kurkowski <john.kurkowski@gmail.com>
  • Loading branch information
elliotwutingfeng and john-kurkowski committed May 16, 2023
1 parent e187686 commit ad27cca
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 3 deletions.
7 changes: 7 additions & 0 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,13 @@ def test_empty():


def test_scheme():
assert_extract("//", ("", "", "", ""))
assert_extract("://", ("", "", "", ""))
assert_extract("://example.com", ("", "", "", ""))
assert_extract("a+-.://example.com", ("example.com", "", "example", "com"))
assert_extract("a#//example.com", ("", "", "a", ""))
assert_extract("a@://example.com", ("", "", "", ""))
assert_extract("#//example.com", ("", "", "", ""))
assert_extract(
"https://mail.google.com/mail", ("mail.google.com", "mail", "google", "com")
)
Expand Down
18 changes: 15 additions & 3 deletions tldextract/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,15 @@
r"{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$"
)

SCHEME_RE = re.compile(r"^([" + scheme_chars + "]+:)?//")
scheme_chars_set = set(scheme_chars)


def lenient_netloc(url: str) -> str:
"""Extract the netloc of a URL-like string, similar to the netloc attribute
returned by urllib.parse.{urlparse,urlsplit}, but extract more leniently,
without raising errors."""

return (
SCHEME_RE.sub("", url, 1)
_schemeless_url(url)
.partition("/")[0]
.partition("?")[0]
.partition("#")[0]
Expand All @@ -29,6 +28,19 @@ def lenient_netloc(url: str) -> str:
)


def _schemeless_url(url: str) -> str:
double_slashes_start = url.find("//")
if double_slashes_start == 0:
return url[2:]
if (
double_slashes_start < 2
or not url[double_slashes_start - 1] == ":"
or set(url[: double_slashes_start - 1]) - scheme_chars_set
):
return url
return url[double_slashes_start + 2 :]


def looks_like_ip(maybe_ip: str) -> bool:
"""Does the given str look like an IP address?"""
if not maybe_ip[0].isdigit():
Expand Down

0 comments on commit ad27cca

Please sign in to comment.