compatibility: extract_links + is_exhausted_domain #51

Merged · 7 commits · Jul 13, 2023
8 changes: 4 additions & 4 deletions courlan/clean.py
@@ -37,15 +37,15 @@


 def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
-    """Helper function: chained scrubbing and normalization"""
+    "Helper function: chained scrubbing and normalization"
     try:
         return normalize_url(scrub_url(url), False, language)
     except (AttributeError, ValueError):
         return None


 def scrub_url(url: str) -> str:
-    """Strip unnecessary parts and make sure only one URL is considered"""
+    "Strip unnecessary parts and make sure only one URL is considered"
     # trim
     # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
     # remove leading and trailing white space and unescaped control chars
@@ -100,7 +100,7 @@ def scrub_url(url: str) -> str:
 def clean_query(
     parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
 ) -> str:
-    """Strip unwanted query elements"""
+    "Strip unwanted query elements"
     if len(parsed_url.query) > 0:
         qdict = parse_qs(parsed_url.query)
         newqdict = {}
@@ -152,7 +152,7 @@ def normalize_url(
     strict: bool = False,
     language: Optional[str] = None,
 ) -> str:
-    """Takes a URL string or a parsed URL and returns a (basically) normalized URL string"""
+    "Takes a URL string or a parsed URL and returns a normalized URL string"
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
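The functions touched above form courlan's cleaning chain. A minimal usage sketch, assuming the signatures shown in this diff; the sample URL and the commented effects are illustrative, not taken from the test suite:

from courlan.clean import clean_url, normalize_url, scrub_url

raw = "  HTTPS://www.Example.org/path/?utm_source=feed#section "
# scrub_url() trims surrounding whitespace and markup remnants
scrubbed = scrub_url(raw)
# normalize_url() lowercases scheme and host, removes fragments and strips
# unwanted query elements; clean_url() chains both and returns None on failure
print(normalize_url(scrubbed, strict=True))
print(clean_url(raw))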
19 changes: 12 additions & 7 deletions courlan/core.py
@@ -126,8 +126,9 @@ def check_url(

 def extract_links(
     pagecontent: str,
-    full_url: str,
-    external_bool: bool,
+    url: Optional[str] = None,
+    base_url: Optional[str] = None,
+    external_bool: bool = False,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
@@ -138,7 +139,8 @@
     """Filter links in a HTML document using a series of heuristics
     Args:
         pagecontent: whole page in binary format
-        full_url: full URL of the page
+        url: full URL of the original page
+        base_url: deprecated, legacy only
         external_bool: set to True for external links only, False for
             internal links only
         no_filter: override settings and bypass checks to return all possible URLs
@@ -154,7 +156,8 @@
     Raises:
         Nothing.
     """
-    base_url = get_base_url(full_url)
+    base_url = base_url or get_base_url(url)
+    url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
@@ -182,7 +185,7 @@
     for link in candidates:
         # repair using base
         if not link.startswith("http"):
-            link = fix_relative_urls(full_url, link)
+            link = fix_relative_urls(url, link)
         # check
         if no_filter is False:
             checked = check_url(
@@ -210,7 +213,8 @@

 def filter_links(
     htmlstring: str,
-    full_url: str,
+    url: Optional[str],
+    base_url: Optional[str] = None,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
@@ -219,9 +223,10 @@
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter them and add them to the data store."
     links, links_priority = [], []
+    url = url or base_url
     for link in extract_links(
         pagecontent=htmlstring,
-        full_url=full_url,
+        url=url,
         external_bool=external,
         language=lang,
         strict=strict,
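For reference, a brief sketch of how the reworked signatures can be called. The keyword names come from this diff (url= is the new argument, base_url= remains as a legacy fallback); the HTML snippet and URLs are illustrative:

from courlan.core import extract_links, filter_links

html = "<html><body><a href='subpage/'>a</a> <a href='https://other.org/'>b</a></body></html>"
page = "https://example.org/dir/page.html"

# url= carries the full page address so relative links resolve against it;
# passing only base_url= keeps the old behaviour of resolving against the domain root
internal = extract_links(html, url=page, external_bool=False)
external = extract_links(html, url=page, external_bool=True)

# filter_links() wraps extract_links() and splits the result into two lists
links, links_priority = filter_links(html, url=page)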
13 changes: 9 additions & 4 deletions courlan/urlstore.py
@@ -27,6 +27,7 @@

 from urllib.robotparser import RobotFileParser

+from .clean import normalize_url
 from .core import filter_links
 from .filters import lang_filter, validate_url
 from .meta import clear_caches
@@ -115,6 +116,9 @@
                 ):
                     LOGGER.debug("Wrong language: %s", url)
                     raise ValueError
+                parsed_url = normalize_url(
+                    parsed_url, strict=self.strict, language=self.language
+                )
                 hostinfo, urlpath = get_host_and_path(parsed_url)
                 inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
             except (TypeError, ValueError):
@@ -235,18 +239,18 @@ def add_urls(
     def add_from_html(
         self,
         htmlstring: str,
-        full_url: str,
+        url: str,
         external: bool = False,
         lang: Optional[str] = None,
         with_nav: bool = True,
     ) -> None:
         "Find links in a HTML document, filter them and add them to the data store."
         # lang = lang or self.language
-        base_url = get_base_url(full_url)
+        base_url = get_base_url(url)
         rules = self.get_rules(base_url)
         links, links_priority = filter_links(
             htmlstring=htmlstring,
-            full_url=full_url,
+            url=url,
             external=external,
             lang=lang or self.language,
             rules=rules,
@@ -293,7 +297,8 @@ def is_exhausted_domain(self, domain: str) -> bool:
         "Tell if all known URLs for the website have been visited."
         if domain in self.urldict:
             return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
-        raise KeyError("website not in store")
+        return False
+        # raise KeyError("website not in store")

     def unvisited_websites_number(self) -> int:
         "Return the number of websites for which there are still URLs to visit."
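A short sketch of the resulting UrlStore behaviour, assuming the class is exposed at the package level as in the library's documentation; the domain names are illustrative:

from courlan import UrlStore

store = UrlStore()
html = "<html><body><a href='/page1'>1</a> <a href='/page2'>2</a></body></html>"
# add_from_html() now takes the full page URL via the url argument
store.add_from_html(html, url="https://example.org/index.html")

# False while unvisited URLs remain for the domain
print(store.is_exhausted_domain("https://example.org"))
# unknown domains now return False instead of raising KeyError
print(store.is_exhausted_domain("https://unknown.org"))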
28 changes: 26 additions & 2 deletions tests/unit_tests.py
@@ -310,6 +310,8 @@ def test_path_filter():


 def test_lang_filter():
+    assert lang_filter("http://test.com/az/", "de") is False
+    assert lang_filter("http://test.com/de/", "de") is True
     assert (
         lang_filter(
             "https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377",
@@ -816,6 +818,24 @@ def test_extraction():
         "https://httpbin.org/links/2/0",
         "https://httpbin.org/links/2/1",
     ]
+    links = extract_links(
+        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/links/2/0",
+        "https://httpbin.org/links/2/1",
+    ]
+    pagecontent = "<html><head><title>Links</title></head><body><a href='links/2/0'>0</a> <a href='links/2/1'>1</a> </body></html>"
+    links = extract_links(
+        pagecontent,
+        url="https://httpbin.org/page1/",
+        external_bool=False,
+        with_nav=True,
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/page1/links/2/0",
+        "https://httpbin.org/page1/links/2/1",
+    ]
     pagecontent = "<html><head><title>Pages</title></head><body><a href='/page/10'>10</a> <a href='/page/?=11'>11</a></body></html>"
     assert (
         extract_links(
@@ -889,8 +909,12 @@ def test_extraction():
     base_url = "https://example.org"
     htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
     links, links_priority = filter_links(htmlstring, base_url)
-    assert len(links) == 1
-    assert not links_priority
+    assert len(links) == 1 and not links_priority
+    # link filtering with relative URLs
+    url = "https://example.org/page1.html"
+    htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
+    links, links_priority = filter_links(htmlstring, url=url)
+    assert len(links) == 1 and not links_priority


 def test_cli():
5 changes: 3 additions & 2 deletions tests/urlstore_tests.py
@@ -170,8 +170,9 @@ def test_urlstore():
     assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
     assert my_urls.urldict["https://visited.com"].state is State.OPEN
     assert my_urls.is_exhausted_domain("https://visited.com") is False
-    with pytest.raises(KeyError):
-        assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    # with pytest.raises(KeyError):
+    #     assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    assert my_urls.is_exhausted_domain("https://visited2.com") is False
     # revert changes for further tests
     del my_urls.urldict["https://visited.com"].tuples[1]
     my_urls.urldict["https://visited.com"].state = State.ALL_VISITED