compatibility: extract_links + is_exhausted_domain #51

Merged: 7 commits, Jul 13, 2023
8 changes: 4 additions & 4 deletions courlan/clean.py
@@ -37,15 +37,15 @@


def clean_url(url: str, language: Optional[str] = None) -> Optional[str]:
"""Helper function: chained scrubbing and normalization"""
"Helper function: chained scrubbing and normalization"
try:
return normalize_url(scrub_url(url), False, language)
except (AttributeError, ValueError):
return None


def scrub_url(url: str) -> str:
"""Strip unnecessary parts and make sure only one URL is considered"""
"Strip unnecessary parts and make sure only one URL is considered"
# trim
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# remove leading and trailing white space and unescaped control chars
@@ -100,7 +100,7 @@ def scrub_url(url: str) -> str:
def clean_query(
parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
) -> str:
"""Strip unwanted query elements"""
"Strip unwanted query elements"
if len(parsed_url.query) > 0:
qdict = parse_qs(parsed_url.query)
newqdict = {}
@@ -152,7 +152,7 @@ def normalize_url(
strict: bool = False,
language: Optional[str] = None,
) -> str:
"""Takes a URL string or a parsed URL and returns a (basically) normalized URL string"""
"Takes a URL string or a parsed URL and returns a normalized URL string"
parsed_url = _parse(parsed_url)
# lowercase + remove fragments + normalize punycode
scheme = parsed_url.scheme.lower()
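For reference, a minimal sketch of how the helpers touched above chain together: clean_url() wraps scrub_url() and normalize_url(), as the diff shows. The example URL and parameter values below are illustrative only.

```python
from courlan.clean import clean_url, normalize_url, scrub_url

raw = "  https://www.Example.org/page?id=1#section  "
trimmed = scrub_url(raw)  # strip whitespace and other unwanted parts
# lowercase, drop the fragment, optionally apply strict/language rules
norm = normalize_url(trimmed, strict=False, language="en")
# both steps chained; returns None if scrubbing or normalization fails
print(clean_url(raw, language="en"))
```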
19 changes: 12 additions & 7 deletions courlan/core.py
@@ -126,8 +126,9 @@ def check_url(

def extract_links(
pagecontent: str,
- full_url: str,
- external_bool: bool,
+ url: Optional[str] = None,
+ base_url: Optional[str] = None,
+ external_bool: bool = False,
no_filter: bool = False,
language: Optional[str] = None,
strict: bool = True,
@@ -138,7 +139,8 @@
"""Filter links in a HTML document using a series of heuristics
Args:
pagecontent: whole page in binary format
- full_url: full URL of the page
+ url: full URL of the original page
+ base_url: deprecated, legacy only
external_bool: set to True for external links only, False for
internal links only
no_filter: override settings and bypass checks to return all possible URLs
@@ -154,7 +156,8 @@
Raises:
Nothing.
"""
- base_url = get_base_url(full_url)
+ base_url = base_url or get_base_url(url)
+ url = url or base_url
candidates, validlinks = set(), set() # type: Set[str], Set[str]
if not pagecontent:
return validlinks
@@ -182,7 +185,7 @@
for link in candidates:
# repair using base
if not link.startswith("http"):
- link = fix_relative_urls(full_url, link)
+ link = fix_relative_urls(url, link)
# check
if no_filter is False:
checked = check_url(
@@ -210,7 +213,8 @@

def filter_links(
htmlstring: str,
- full_url: str,
+ url: Optional[str],
+ base_url: Optional[str] = None,
lang: Optional[str] = None,
rules: Optional[RobotFileParser] = None,
external: bool = False,
@@ -219,9 +223,10 @@
) -> Tuple[List[str], List[str]]:
"Find links in a HTML document, filter them and add them to the data store."
links, links_priority = [], []
+ url = url or base_url
for link in extract_links(
pagecontent=htmlstring,
- full_url=full_url,
+ url=url,
external_bool=external,
language=lang,
strict=strict,
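A minimal usage sketch of the updated signatures: the HTML snippet and URLs are invented, and `base_url` appears only as the legacy path this patch keeps for compatibility.

```python
from courlan.core import extract_links, filter_links

html = "<html><body><a href='sub/1'>1</a> <a href='https://other.org/'>x</a></body></html>"

# new-style call: pass the full page URL so relative links resolve against it
links = extract_links(html, url="https://example.org/dir/page.html", external_bool=False)

# legacy call: only a base URL is known; still accepted via the deprecated parameter
links_legacy = extract_links(html, base_url="https://example.org", external_bool=False)

# filter_links() mirrors the same url/base_url pair and returns regular
# and priority (navigation) links as two separate lists
regular, priority = filter_links(html, url="https://example.org/dir/page.html")
```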
13 changes: 9 additions & 4 deletions courlan/urlstore.py
@@ -27,6 +27,7 @@

from urllib.robotparser import RobotFileParser

+ from .clean import normalize_url
from .core import filter_links
from .filters import lang_filter, validate_url
from .meta import clear_caches
@@ -115,6 +116,9 @@ def _buffer_urls(
):
LOGGER.debug("Wrong language: %s", url)
raise ValueError
+ parsed_url = normalize_url(
+ parsed_url, strict=self.strict, language=self.language
+ )
hostinfo, urlpath = get_host_and_path(parsed_url)
inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
except (TypeError, ValueError):
@@ -235,18 +239,18 @@ def add_urls(
def add_from_html(
self,
htmlstring: str,
- full_url: str,
+ url: str,
external: bool = False,
lang: Optional[str] = None,
with_nav: bool = True,
) -> None:
"Find links in a HTML document, filter them and add them to the data store."
# lang = lang or self.language
- base_url = get_base_url(full_url)
+ base_url = get_base_url(url)
rules = self.get_rules(base_url)
links, links_priority = filter_links(
htmlstring=htmlstring,
- full_url=full_url,
+ url=url,
external=external,
lang=lang or self.language,
rules=rules,
@@ -293,7 +297,8 @@ def is_exhausted_domain(self, domain: str) -> bool:
"Tell if all known URLs for the website have been visited."
if domain in self.urldict:
return self.urldict[domain].state in (State.ALL_VISITED, State.BUSTED)
- raise KeyError("website not in store")
+ return False
+ # raise KeyError("website not in store")

def unvisited_websites_number(self) -> int:
"Return the number of websites for which there are still URLs to visit."
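A sketch of how the relaxed UrlStore behaviour plays out. The method names come from the diff above; the URLs, the package-level import, and the add_urls() call are assumptions for illustration.

```python
from courlan import UrlStore  # assuming the package-level export

store = UrlStore()
store.add_urls(["https://example.org/path?b=2&a=1"])  # assumed signature; URLs are now normalized on the way in

# add_from_html() now takes the full page URL (formerly full_url)
html = "<html><body><a href='/other'>o</a></body></html>"
store.add_from_html(html, url="https://example.org/index.html")

# unknown hosts no longer raise KeyError, they simply report False
print(store.is_exhausted_domain("https://unknown-host.org"))  # False
print(store.is_exhausted_domain("https://example.org"))       # False while URLs remain to visit
```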
28 changes: 26 additions & 2 deletions tests/unit_tests.py
@@ -310,6 +310,8 @@ def test_path_filter():


def test_lang_filter():
+ assert lang_filter("http://test.com/az/", "de") is False
+ assert lang_filter("http://test.com/de/", "de") is True
assert (
lang_filter(
"https://www.20min.ch/fr/story/des-millions-pour-produire-de-l-energie-renouvelable-467974085377",
@@ -816,6 +818,24 @@ def test_extraction():
"https://httpbin.org/links/2/0",
"https://httpbin.org/links/2/1",
]
+ links = extract_links(
+ pagecontent, base_url="https://httpbin.org", external_bool=False, with_nav=True
+ )
+ assert sorted(links) == [
+ "https://httpbin.org/links/2/0",
+ "https://httpbin.org/links/2/1",
+ ]
+ pagecontent = "<html><head><title>Links</title></head><body><a href='links/2/0'>0</a> <a href='links/2/1'>1</a> </body></html>"
+ links = extract_links(
+ pagecontent,
+ url="https://httpbin.org/page1/",
+ external_bool=False,
+ with_nav=True,
+ )
+ assert sorted(links) == [
+ "https://httpbin.org/page1/links/2/0",
+ "https://httpbin.org/page1/links/2/1",
+ ]
pagecontent = "<html><head><title>Pages</title></head><body><a href='/page/10'>10</a> <a href='/page/?=11'>11</a></body></html>"
assert (
extract_links(
@@ -889,8 +909,12 @@ def test_extraction():
base_url = "https://example.org"
htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
links, links_priority = filter_links(htmlstring, base_url)
- assert len(links) == 1
- assert not links_priority
+ assert len(links) == 1 and not links_priority
+ # link filtering with relative URLs
+ url = "https://example.org/page1.html"
+ htmlstring = '<html><body><a href="/subpage1"/><a href="/subpage1/"/><a href="https://test.org/page1"/></body></html>'
+ links, links_priority = filter_links(htmlstring, url=url)
+ assert len(links) == 1 and not links_priority


def test_cli():
5 changes: 3 additions & 2 deletions tests/urlstore_tests.py
@@ -170,8 +170,9 @@ def test_urlstore():
assert my_urls.urldict["https://visited.com"].tuples[1].visited is False
assert my_urls.urldict["https://visited.com"].state is State.OPEN
assert my_urls.is_exhausted_domain("https://visited.com") is False
- with pytest.raises(KeyError):
- assert my_urls.is_exhausted_domain("https://visited2.com") is True
+ # with pytest.raises(KeyError):
+ # assert my_urls.is_exhausted_domain("https://visited2.com") is True
+ assert my_urls.is_exhausted_domain("https://visited2.com") is False
# revert changes for further tests
del my_urls.urldict["https://visited.com"].tuples[1]
my_urls.urldict["https://visited.com"].state = State.ALL_VISITED