From ce42fcb0680e23f975aa4ad356529274a9ab576f Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Thu, 1 Jun 2023 19:05:47 +0200
Subject: [PATCH] adjust unvisited urls, domains and done meta-info (#41)

* adjust unvisited domains and done meta-info

* add test

* use domain exhaustion methods

* add test
---
 courlan/urlstore.py     | 34 ++++++++++++++++++++++------------
 tests/urlstore_tests.py |  3 +++
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/courlan/urlstore.py b/courlan/urlstore.py
index ea7a981..331e124 100644
--- a/courlan/urlstore.py
+++ b/courlan/urlstore.py
@@ -172,6 +172,9 @@ def _store_urls(
             # timestamp/backoff value
             if timestamp is not None:
                 self.urldict[domain].timestamp = timestamp
+            # adjust general state
+            if self.done and not self.urldict[domain].all_visited:
+                self.done = False
 
     def _search_urls(
         self, urls: List[str], switch: Optional[int] = None
@@ -248,8 +251,15 @@ def is_exhausted_domain(self, domain: str) -> bool:
         raise KeyError("website not in store")
 
     def get_unvisited_domains(self) -> List[str]:
-        "Return all domains which have not been all visited."
-        return [d for d in self.urldict if not self.urldict[d].all_visited]
+        """Return the domains that still hold unvisited URLs and flag the
+        store as done once none is left (an empty list is returned then)."""
+        with self._lock:
+            if self.done:
+                return []
+            pending = [d for d, e in self.urldict.items() if not e.all_visited]
+            if not pending:
+                self.done = True
+        return pending
 
     # URL-BASED QUERIES
 
@@ -262,7 +272,11 @@ def has_been_visited(self, url: str) -> bool:
 
     def find_unvisited_urls(self, domain: str) -> List[str]:
         "Get all unvisited URLs for the given domain."
-        return [domain + u.urlpath for u in self._load_urls(domain) if not u.visited]
+        # unknown domains yield [] rather than the KeyError of is_exhausted_domain
+        if domain not in self.urldict or self.is_exhausted_domain(domain):
+            return []
+        known = self._load_urls(domain)
+        return [domain + u.urlpath for u in known if not u.visited]
 
     def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
         "Take a list of URLs and return the currently unvisited ones."
@@ -277,7 +291,7 @@ def unvisited_websites_number(self) -> int:
     def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
         "Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
         # not fully used
-        if not self.urldict[domain].all_visited:
+        if not self.is_exhausted_domain(domain):
             url_tuples = self._load_urls(domain)
             # get first non-seen url
             for url in url_tuples:
@@ -297,10 +311,8 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
     def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
         """Get a list of immediately downloadable URLs according to the given
         time limit per domain."""
-        with self._lock:
-            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
+        potential = self.get_unvisited_domains()
         if not potential:
-            self.done = True
             return []
         targets = []
         for domain in potential:
@@ -319,11 +331,9 @@ def establish_download_schedule(
         """Get up to the specified number of URLs along with a suitable
         backoff schedule (in seconds)."""
         # see which domains are free
-        with self._lock:
-            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
-            if not potential:
-                self.done = True
-                return []
+        potential = self.get_unvisited_domains()
+        if not potential:
+            return []
         # variables init
         per_domain = max_urls // len(potential) or 1
         targets: List[Tuple[float, str]] = []
diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py
index 0d00a71..5f52f2a 100644
--- a/tests/urlstore_tests.py
+++ b/tests/urlstore_tests.py
@@ -132,6 +132,7 @@ def test_urlstore():
     my_urls.add_urls(["https://visited.com/visited"], visited=True)
     assert my_urls.urldict["https://visited.com"].tuples[0].visited is True
     assert my_urls.urldict["https://visited.com"].all_visited is True
+    assert not my_urls.find_unvisited_urls("https://visited.com")
     assert my_urls.is_exhausted_domain("https://visited.com") is True
     # new unvisited URLs
     my_urls.add_urls(["https://visited.com/1"], visited=False)
@@ -271,6 +272,8 @@ def test_urlstore():
     schedule = other_store.establish_download_schedule()
     assert not schedule
     # store exhaustion
+    other_store.add_urls(["https://www.test.org/1"])
+    assert other_store.done is False
     other_store = UrlStore()
     other_store.add_urls(
         ["http://domain.fi/page1", "http://domain.fi/page2", "http://domain.no/0"]