Commit bd82dd5
efficiency: use urlunsplit and simplify urlutils (#33)
* use urlunsplit and simplify urlutils

* filter_urls: preserve two steps and add regex
adbar authored May 17, 2023
1 parent 892920c commit bd82dd5
Showing 1 changed file with 27 additions and 22 deletions.
49 changes: 27 additions & 22 deletions courlan/urlutils.py
@@ -6,7 +6,7 @@

 from functools import lru_cache
 from typing import Any, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse, ParseResult
+from urllib.parse import urlparse, urlunsplit, ParseResult

 from tld import get_tld
@@ -22,6 +22,8 @@
 NO_EXTENSION_REGEX = re.compile(r"(^[^.]+)")
 STRIP_DOMAIN_REGEX = re.compile(r"^.+?:.*?@|(?<=[^0-9]):[0-9]+")
 CLEAN_FLD_REGEX = re.compile(r"^www[0-9]*\.")
+INNER_SLASH_REGEX = re.compile(r"(.+/)+")
+FEED_WHITELIST_REGEX = re.compile(r"(feedburner|feedproxy)", re.I)


 @lru_cache(maxsize=1024)
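As an aside, a minimal sketch of what the two new precompiled patterns do (illustrative inputs, not part of the diff):

import re

INNER_SLASH_REGEX = re.compile(r"(.+/)+")
FEED_WHITELIST_REGEX = re.compile(r"(feedburner|feedproxy)", re.I)

# the greedy match removes every leading path segment, keeping the last component
print(INNER_SLASH_REGEX.sub("", "./img/logo.png"))  # logo.png

# case-insensitive search for known feed aggregator hosts
print(bool(FEED_WHITELIST_REGEX.search("http://feeds.FeedBurner.com/example")))  # True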
@@ -92,7 +94,9 @@ def get_host_and_path(url: Any) -> Tuple[str, str]:
     Accepts strings and urllib.parse ParseResult objects."""
     parsed_url = _parse(url)
     hostname = get_base_url(parsed_url)
-    pathval = parsed_url._replace(scheme="", netloc="").geturl()
+    pathval = urlunsplit(
+        ["", "", parsed_url.path, parsed_url.query, parsed_url.fragment]
+    )
     # correction for root/homepage
     if pathval == "":
         pathval = "/"
@@ -117,28 +121,24 @@ def fix_relative_urls(baseurl: str, url: str) -> str:
         return baseurl + url
     if url.startswith("."):
         # don't try to correct these URLs
-        return baseurl + "/" + re.sub(r"(.+/)+", "", url)
-    if not url.startswith("http") and not url.startswith("{"):
+        return baseurl + "/" + INNER_SLASH_REGEX.sub("", url)
+    if not url.startswith(("http", "{")):
         return baseurl + "/" + url
     # todo: handle here
     # if url.startswith('{'):
     return url


-def filter_urls(linklist: List[str], urlfilter: Optional[str]) -> List[str]:
+def filter_urls(link_list: List[str], urlfilter: Optional[str]) -> List[str]:
     "Return a list of links corresponding to the given substring pattern."
     if urlfilter is None:
-        return sorted(set(linklist))
+        return sorted(set(link_list))
     # filter links
-    newlist = [l for l in linklist if urlfilter in l]
+    filtered_list = [l for l in link_list if urlfilter in l]
     # feedburner option: filter and wildcards for feeds
-    if not newlist:
-        newlist = [
-            l
-            for l in linklist
-            if urlfilter in l or "feedburner" in l or "feedproxy" in l
-        ]
-    return sorted(set(newlist))
+    if not filtered_list:
+        filtered_list = [l for l in link_list if FEED_WHITELIST_REGEX.search(l)]
+    return sorted(set(filtered_list))


 def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
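A hypothetical usage sketch of the two rewritten functions (assuming imports from courlan.urlutils; inputs are made up):

from courlan.urlutils import filter_urls, fix_relative_urls

# dot-relative links are reduced to their final path component
print(fix_relative_urls("https://example.org", "./img/logo.png"))
# https://example.org/logo.png

# if the substring filter matches nothing, the feed whitelist regex
# serves as a fallback instead of repeating the substring test
links = ["https://example.org/post", "http://feedproxy.google.com/blog"]
print(filter_urls(links, "rss"))
# ['http://feedproxy.google.com/blog']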
@@ -154,20 +154,25 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:

 def is_known_link(link: str, known_links: Set[str]) -> bool:
     "Compare the link and its possible variants to the existing URL base."
-    # easy check
+    # check exact link
     if link in known_links:
         return True
-    # trailing slash
-    test1 = link.rstrip("/")
-    test2 = test1 + "/"
-    if test1 in known_links or test2 in known_links:
+
+    # check link and variants with trailing slashes
+    test_links = [link.rstrip("/"), link.rstrip("/") + "/"]
+    if any(test_link in known_links for test_link in test_links):
         return True
-    # http/https + trailing slash
+
+    # check link and variants with modified protocol
     if link.startswith("http"):
         if link.startswith("https"):
             testlink = link[:4] + link[5:]
         else:
             testlink = "".join([link[:4], "s", link[4:]])
-        test1, test2 = testlink.rstrip("/"), testlink.rstrip("/") + "/"
-        return testlink in known_links or test1 in known_links or test2 in known_links
+        if any(
+            test in known_links
+            for test in [testlink, testlink.rstrip("/"), testlink.rstrip("/") + "/"]
+        ):
+            return True
+
+    return False
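A short sketch of the refactored variant checks (hypothetical inputs):

from courlan.urlutils import is_known_link

known = {"https://example.org/page/"}
# found via the protocol-swapped variant with a trailing slash added
print(is_known_link("http://example.org/page", known))  # True
# no variant of this link is in the set
print(is_known_link("https://example.org/other", known))  # False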
