compatibility: extract_links + is_exhausted_domain (#51)
* review extract_links compatibility

* fix tests

* remove KeyError in is_exhausted_domain()

* unify: remove trailing slashes

* adjust lang filter

* leave trailing slashes for now

* temp fix for tests
adbar authored Jul 13, 2023
1 parent 422785b commit d3278ab
Showing 5 changed files with 54 additions and 19 deletions.
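As a rough usage sketch (not part of the diff below): the reworked extract_links now takes the page URL as an optional keyword, keeps base_url as a deprecated alias, and lets external_bool default to False. The HTML snippet and URLs here are illustrative only:

```python
from courlan.core import extract_links

# illustrative page snippet, not taken from the repository
html = "<html><body><a href='/about/'>About</a></body></html>"

# preferred call: pass the full URL of the page being processed
links = extract_links(html, url="https://example.org/blog/post/")

# legacy call: base_url is still accepted but documented as deprecated
legacy_links = extract_links(html, base_url="https://example.org")

print(links, legacy_links)
```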
8 changes: 4 additions & 4 deletions courlan/clean.py
@@ -37,15 +37,15 @@


def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
"""Helper function: chained scrubbing and normalization"""
"Helper function: chained scrubbing and normalization"
try:
return normalize_url(scrub_url(url), False, language)
except (AttributeError, ValueError):
return None


def scrub_url(url: str) -> str:
"""Strip unnecessary parts and make sure only one URL is considered"""
"Strip unnecessary parts and make sure only one URL is considered"
# trim
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# remove leading and trailing white space and unescaped control chars
@@ -100,7 +100,7 @@ def scrub_url(url: str) -> str:
def clean_query(
parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
) -> str:
"""Strip unwanted query elements"""
"Strip unwanted query elements"
if len(parsed_url.query) > 0:
qdict = parse_qs(parsed_url.query)
newqdict = {}
@@ -152,7 +152,7 @@ def normalize_url(
strict: bool = False,
language: Optional[str] = None,
) -> str:
"""Takes a URL string or a parsed URL and returns a (basically) normalized URL string"""
"Takes a URL string or a parsed URL and returns a normalized URL string"
parsed_url = _parse(parsed_url)
# lowercase + remove fragments + normalize punycode
scheme = parsed_url.scheme.lower()
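For context, a minimal sketch of how these helpers are typically chained, based on the "lowercase + remove fragments + normalize punycode" step visible in the hunk above; the sample URL is made up and the exact normalized output is not asserted:

```python
from courlan.clean import clean_url, normalize_url, scrub_url

# illustrative input: uppercase host plus a fragment
raw = "HTTPS://Example.org/Path/#section"

# scrub_url trims the string, normalize_url lowercases scheme/host and drops the fragment
print(normalize_url(scrub_url(raw)))

# clean_url chains both steps and returns None if the URL cannot be parsed
print(clean_url(raw))
```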
19 changes: 12 additions & 7 deletions courlan/core.py
@@ -132,8 +132,9 @@ def check_url(

def extract_links(
pagecontent: str,
- full_url: str,
- external_bool: bool,
+ url: Optional[str] = None,
+ base_url: Optional[str] = None,
+ external_bool: bool = False,
no_filter: bool = False,
language: Optional[str] = None,
strict: bool = True,
@@ -144,7 +145,8 @@
"""Filter links in a HTML document using a series of heuristics
Args:
pagecontent: whole page in binary format
- full_url: full URL of the page
+ url: full URL of the original page
+ base_url: deprecated, legacy only
external_bool: set to True for external links only, False for
internal links only
no_filter: override settings and bypass checks to return all possible URLs
Expand All @@ -160,7 +162,8 @@ def extract_links(
Raises:
Nothing.
"""
- base_url = get_base_url(full_url)
+ base_url = base_url or get_base_url(url)
+ url = url or base_url
candidates, validlinks = set(), set() # type: Set[str], Set[str]
if not pagecontent:
return validlinks
@@ -188,7 +191,7 @@
for link in candidates:
# repair using base
if not link.startswith("http"):
- link = fix_relative_urls(full_url, link)
+ link = fix_relative_urls(url, link)
# check
if no_filter is False:
checked = check_url(
@@ -216,7 +219,8 @@

def filter_links(
htmlstring: str,
- full_url: str,
+ url: Optional[str],
+ base_url: Optional[str] = None,
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
external: bool = False,
@@ -225,9 +229,10 @@
) -> Tuple[List[str], List[str]]:
"Find links in a HTML document, filter them and add them to the data store."
links, links_priority = [], []
+ url = url or base_url
for link in extract_links(
pagecontent=htmlstring,
- full_url=full_url,
+ url=url,
external_bool=external,
language=lang,
strict=strict,
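A short sketch of the behavioural difference this introduces, mirroring the new test cases further down: relative links are now repaired against the full page URL when one is given, whereas the legacy base_url keyword resolves them against the site root:

```python
from courlan.core import extract_links

pagecontent = (
    "<html><head><title>Links</title></head><body>"
    "<a href='links/2/0'>0</a> <a href='links/2/1'>1</a>"
    "</body></html>"
)

# legacy keyword: relative links are resolved against the site root
root_links = extract_links(pagecontent, base_url="https://httpbin.org", with_nav=True)
# expected (per the updated tests): https://httpbin.org/links/2/0 and .../links/2/1

# new keyword: the /page1/ path component of the page URL is preserved
page_links = extract_links(pagecontent, url="https://httpbin.org/page1/", with_nav=True)
# expected (per the updated tests): https://httpbin.org/page1/links/2/0 and .../page1/links/2/1
```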
13 changes: 9 additions & 4 deletions courlan/urlstore.py
@@ -27,6 +27,7 @@

from urllib.robotparser import RobotFileParser

+ from .clean import normalize_url
from .core import filter_links
from .filters import lang_filter, validate_url
from .meta import clear_caches
@@ -115,6 +116,9 @@
):
LOGGER.debug("Wrong language: %s", url)
raise ValueError
+ parsed_url = normalize_url(
+ parsed_url, strict=self.strict, language=self.language
+ )
hostinfo, urlpath = get_host_and_path(parsed_url)
inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
except (TypeError, ValueError):
@@ -235,18 +239,18 @@ def add_urls(
def add_from_html(
self,
htmlstring: str,
- full_url: str,
+ url: str,
external: bool = False,
lang: Optional[str] = None,
with_nav: bool = True,
) -> None:
"Find links in a HTML document, filter them and add them to the data store."
# lang = lang or self.language
- base_url = get_base_url(full_url)
+ base_url = get_base_url(url)
rules = self.get_rules(base_url)
links, links_priority = filter_links(
htmlstring=htmlstring,
- full_url=full_url,
+ url=url,
external=external,
lang=lang or self.language,
rules=rules,
Expand Down Expand Up @@ -293,7 +297,8 @@ def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
if domain in self.urldict:
return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
- raise KeyError("website not in store")
+ return False
+ # raise KeyError("website not in store")

def unvisited_websites_number(self) -> int:
"Return the number of websites for which there are still URLs to visit."
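A minimal sketch of the adjusted is_exhausted_domain contract, in line with the updated urlstore test below; the example domains are made up and add_urls is assumed to accept a plain list of URLs:

```python
from courlan.urlstore import UrlStore

store = UrlStore()
store.add_urls(["https://example.org/page1", "https://example.org/page2"])

# known domain with URLs left to visit
print(store.is_exhausted_domain("https://example.org"))  # False

# unknown domain: previously raised KeyError, now simply reports False
print(store.is_exhausted_domain("https://unknown.org"))  # False
```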
28 changes: 26 additions & 2 deletions tests/unit_tests.py
@@ -310,6 +310,8 @@ def test_path_filter():


def test_lang_filter():
assert lang_filter("http://test.com/az/", "de") is False
assert lang_filter("http://test.com/de/", "de") is True
assert (
lang_filter(
"https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377",
@@ -828,6 +830,24 @@ def test_extraction():
"https://httpbin.org/links/2/0",
"https://httpbin.org/links/2/1",
]
+ links = extract_links(
+ pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+ )
+ assert sorted(links) == [
+ "https://httpbin.org/links/2/0",
+ "https://httpbin.org/links/2/1",
+ ]
+ pagecontent = "<html><head><title>Links</title></head><body><a href='links/2/0'>0</a> <a href='links/2/1'>1</a> </body></html>"
+ links = extract_links(
+ pagecontent,
+ url="https://httpbin.org/page1/",
+ external_bool=False,
+ with_nav=True,
+ )
+ assert sorted(links) == [
+ "https://httpbin.org/page1/links/2/0",
+ "https://httpbin.org/page1/links/2/1",
+ ]
pagecontent = "<html><head><title>Pages</title></head><body><a href='/page/10'>10</a> <a href='/page/?=11'>11</a></body></html>"
assert (
extract_links(
@@ -901,8 +921,12 @@ def test_extraction():
base_url = "https://example.org"
htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
links, links_priority = filter_links(htmlstring, base_url)
- assert len(links) == 1
- assert not links_priority
+ assert len(links) == 1 and not links_priority
+ # link filtering with relative URLs
+ url = "https://example.org/page1.html"
+ htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
+ links, links_priority = filter_links(htmlstring, url=url)
+ assert len(links) == 1 and not links_priority


def test_cli():
5 changes: 3 additions & 2 deletions tests/urlstore_tests.py
@@ -170,8 +170,9 @@ def test_urlstore():
assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
assert my_urls.urldict["https://visited.com"].state is State.OPEN
assert my_urls.is_exhausted_domain("https://visited.com") is False
- with pytest.raises(KeyError):
- assert my_urls.is_exhausted_domain("https://visited2.com") is True
+ # with pytest.raises(KeyError):
+ # assert my_urls.is_exhausted_domain("https://visited2.com") is True
+ assert my_urls.is_exhausted_domain("https://visited2.com") is False
# revert changes for further tests
del my_urls.urldict["https://visited.com"].tuples[1]
my_urls.urldict["https://visited.com"].state = State.ALL_VISITED
