diff --git a/courlan/clean.py b/courlan/clean.py
index 3750674..194368c 100644
--- a/courlan/clean.py
+++ b/courlan/clean.py
@@ -37,7 +37,7 @@ def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
-    """Helper function: chained scrubbing and normalization"""
+    "Helper function: chained scrubbing and normalization"
     try:
         return normalize_url(scrub_url(url), False, language)
     except (AttributeError, ValueError):
@@ -45,7 +45,7 @@ def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
 
 def scrub_url(url: str) -> str:
-    """Strip unnecessary parts and make sure only one URL is considered"""
+    "Strip unnecessary parts and make sure only one URL is considered"
     # trim
     # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
     # remove leading and trailing white space and unescaped control chars
@@ -100,7 +100,7 @@ def scrub_url(url: str) -> str:
 def clean_query(
     parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
 ) -> str:
-    """Strip unwanted query elements"""
+    "Strip unwanted query elements"
     if len(parsed_url.query) > 0:
         qdict = parse_qs(parsed_url.query)
         newqdict = {}
@@ -152,7 +152,7 @@ def normalize_url(
     strict: bool = False,
     language: Optional[str] = None,
 ) -> str:
-    """Takes a URL string or a parsed URL and returns a (basically) normalized URL string"""
+    "Takes a URL string or a parsed URL and returns a normalized URL string"
     parsed_url = _parse(parsed_url)
     # lowercase + remove fragments + normalize punycode
     scheme = parsed_url.scheme.lower()
diff --git a/courlan/core.py b/courlan/core.py
index 595b8b1..8f96806 100644
--- a/courlan/core.py
+++ b/courlan/core.py
@@ -126,8 +126,9 @@ def check_url(
 
 def extract_links(
     pagecontent: str,
-    full_url: str,
-    external_bool: bool,
+    url: Optional[str] = None,
+    base_url: Optional[str] = None,
+    external_bool: bool = False,
     no_filter: bool = False,
     language: Optional[str] = None,
     strict: bool = True,
@@ -138,7 +139,8 @@ def extract_links(
     """Filter links in a HTML document using a series of heuristics
     Args:
         pagecontent: whole page in binary format
-        full_url: full URL of the page
+        url: full URL of the original page
+        base_url: deprecated, legacy only
         external_bool: set to True for external links only,
             False for internal links only
         no_filter: override settings and bypass checks to return all possible URLs
@@ -154,7 +156,8 @@ def extract_links(
     Raises:
         Nothing.
     """
-    base_url = get_base_url(full_url)
+    base_url = base_url or get_base_url(url)
+    url = url or base_url
     candidates, validlinks = set(), set()  # type: Set[str], Set[str]
     if not pagecontent:
         return validlinks
@@ -182,7 +185,7 @@ def extract_links(
     for link in candidates:
         # repair using base
         if not link.startswith("http"):
-            link = fix_relative_urls(full_url, link)
+            link = fix_relative_urls(url, link)
         # check
         if no_filter is False:
             checked = check_url(
@@ -210,7 +213,8 @@ def extract_links(
 
 def filter_links(
     htmlstring: str,
-    full_url: str,
+    url: Optional[str],
+    base_url: Optional[str] = None,
     lang: Optional[str] = None,
     rules: Optional[RobotFileParser] = None,
     external: bool = False,
@@ -219,9 +223,10 @@ def filter_links(
 ) -> Tuple[List[str], List[str]]:
     "Find links in a HTML document, filter them and add them to the data store."
     links, links_priority = [], []
+    url = url or base_url
     for link in extract_links(
         pagecontent=htmlstring,
-        full_url=full_url,
+        url=url,
         external_bool=external,
         language=lang,
         strict=strict,
diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index 80b0ebf..caf383b 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -27,6 +27,7 @@ from urllib.robotparser import RobotFileParser
 
+from .clean import normalize_url
 from .core import filter_links
 from .filters import lang_filter, validate_url
 from .meta import clear_caches
@@ -115,6 +116,9 @@ def _buffer_urls(
                 ):
                     LOGGER.debug("Wrong language: %s", url)
                     raise ValueError
+                parsed_url = normalize_url(
+                    parsed_url, strict=self.strict, language=self.language
+                )
                 hostinfo, urlpath = get_host_and_path(parsed_url)
                 inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
             except (TypeError, ValueError):
@@ -235,18 +239,18 @@ def add_urls(
     def add_from_html(
         self,
         htmlstring: str,
-        full_url: str,
+        url: str,
         external: bool = False,
         lang: Optional[str] = None,
         with_nav: bool = True,
     ) -> None:
         "Find links in a HTML document, filter them and add them to the data store."
         # lang = lang or self.language
-        base_url = get_base_url(full_url)
+        base_url = get_base_url(url)
         rules = self.get_rules(base_url)
         links, links_priority = filter_links(
             htmlstring=htmlstring,
-            full_url=full_url,
+            url=url,
             external=external,
             lang=lang or self.language,
             rules=rules,
@@ -293,7 +297,8 @@ def is_exhausted_domain(self, domain: str) -> bool:
         "Tell if all known URLs for the website have been visited."
         if domain in self.urldict:
             return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
-        raise KeyError("website not in store")
+        return False
+        # raise KeyError("website not in store")
 
     def unvisited_websites_number(self) -> int:
         "Return the number of websites for which there are still URLs to visit."
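
The core.py changes above deprecate full_url in favour of url (the full URL of the page being parsed), with base_url kept as a legacy fallback, and relative links are now repaired against url rather than against the domain root. The following is a minimal sketch of both call styles; the HTML snippet and httpbin URLs are only illustrative, and the expected results are taken from the new tests further down.

from courlan.core import extract_links, filter_links

html = '<html><body><a href="links/2/0">0</a> <a href="links/2/1">1</a></body></html>'

# preferred: pass the full page URL so relative hrefs resolve against it
links = extract_links(html, url="https://httpbin.org/page1/", external_bool=False, with_nav=True)
# expected per the new tests: .../page1/links/2/0 and .../page1/links/2/1

# legacy: base_url alone still works and resolves against the domain root
links = extract_links(html, base_url="https://httpbin.org", external_bool=False, with_nav=True)

# filter_links() accepts the same pair of keywords, url taking precedence
links, links_priority = filter_links(html, url="https://httpbin.org/page1/")
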
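On the UrlStore side, add_from_html() now takes url instead of full_url, and every URL passes through normalize_url() before it is buffered, so stored paths follow the store's strict and language settings (fragments dropped, host lowercased and punycode-normalized). A rough usage sketch with made-up URLs:

from courlan import UrlStore

store = UrlStore()

# URLs are normalized at insertion time: the fragment below is stripped before storage
store.add_urls(["https://www.example.org/path/page.html#section"])

# the keyword is now `url` (full page URL) rather than `full_url`
html = '<html><body><a href="/about.html">About</a></body></html>'
store.add_from_html(html, url="https://www.example.org/index.html")
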
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 7598892..6bd40c9 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -310,6 +310,8 @@ def test_path_filter():
 
 
 def test_lang_filter():
+    assert lang_filter("http://test.com/az/", "de") is False
+    assert lang_filter("http://test.com/de/", "de") is True
     assert (
         lang_filter(
             "https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377",
@@ -816,6 +818,24 @@ def test_extraction():
         "https://httpbin.org/links/2/0",
         "https://httpbin.org/links/2/1",
     ]
+    links = extract_links(
+        pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/links/2/0",
+        "https://httpbin.org/links/2/1",
+    ]
+    pagecontent = "Links0 1 "
+    links = extract_links(
+        pagecontent,
+        url="https://httpbin.org/page1/",
+        external_bool=False,
+        with_nav=True,
+    )
+    assert sorted(links) == [
+        "https://httpbin.org/page1/links/2/0",
+        "https://httpbin.org/page1/links/2/1",
+    ]
     pagecontent = "Pages10 11"
     assert (
         extract_links(
@@ -889,8 +909,12 @@ def test_extraction():
     base_url = "https://example.org"
     htmlstring = ''
     links, links_priority = filter_links(htmlstring, base_url)
-    assert len(links) == 1
-    assert not links_priority
+    assert len(links) == 1 and not links_priority
+    # link filtering with relative URLs
+    url = "https://example.org/page1.html"
+    htmlstring = ''
+    links, links_priority = filter_links(htmlstring, url=url)
+    assert len(links) == 1 and not links_priority
 
 
 def test_cli():
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 9faa1cc..76ab034 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -170,8 +170,9 @@ def test_urlstore():
     assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
     assert my_urls.urldict["https://visited.com"].state is State.OPEN
     assert my_urls.is_exhausted_domain("https://visited.com") is False
-    with pytest.raises(KeyError):
-        assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    # with pytest.raises(KeyError):
+    #     assert my_urls.is_exhausted_domain("https://visited2.com") is True
+    assert my_urls.is_exhausted_domain("https://visited2.com") is False
     # revert changes for further tests
     del my_urls.urldict["https://visited.com"].tuples[1]
     my_urls.urldict["https://visited.com"].state = State.ALL_VISITED
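
The last test above exercises the new contract of is_exhausted_domain(): querying a host that was never added no longer raises KeyError but simply returns False. A small sketch with placeholder hosts:

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://example.org/page1"])

store.is_exhausted_domain("https://example.org")  # False: URLs left to visit
store.is_exhausted_domain("https://example.net")  # False: unknown host, previously a KeyError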