Skip to content

Commit

Permalink
add internal parameters
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Jun 2, 2023
1 parent f9b6e93 commit cf46917
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 2 deletions.
3 changes: 2 additions & 1 deletion courlan/core.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -207,6 +207,7 @@ def filter_links(
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
external: bool = False,
strict: bool = False,
with_nav: bool = True,
) -> Tuple[List[str], List[str]]:
"Find links in a HTML document, filter them and add them to the data store."
Expand All @@ -216,7 +217,7 @@ def filter_links(
base_url=base_url,
external_bool=external,
language=lang,
- strict=False,
+ strict=strict,
with_nav=with_nav,
):
# sanity check
Expand Down
3 changes: 2 additions & 1 deletion courlan/urlstore.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -237,8 +237,9 @@ def add_from_html(
htmlstring=htmlstring,
base_url=base_url,
external=external,
- lang=lang,
+ lang=lang or self.language,
rules=rules,
strict=self.strict,
with_nav=with_nav,
)
self.add_urls(urls=links, appendleft=links_priority)
Expand Down
7 changes: 7 additions & 0 deletions tests/urlstore_tests.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -370,3 +370,10 @@ def test_from_html():
todo = url_store.find_unvisited_urls(base_url)
known_links = url_store.find_known_urls(base_url)
assert "https://example.org/en/page2" not in todo and len(known_links) == 4

# strict + language as URL store parameters
url_store = UrlStore(strict=True, language="de")
base_url = "https://example.org"
htmlstring = '<html><body><a href="https://example.org/en/page2"/><a href="https://example.org/imprint.html"/></body></html>'
url_store.add_from_html(htmlstring, base_url)
assert not url_store.find_known_urls(base_url)

0 comments on commit cf46917

Please sign in to comment.