From ce42fcb0680e23f975aa4ad356529274a9ab576f Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 1 Jun 2023 19:05:47 +0200 Subject: [PATCH] adjust unvisited urls, domains and done meta-info (#41) * adjust unvisited domains and done meta-info * add test * use domain exhaustion methods * add test --- courlan/urlstore.py | 34 ++++++++++++++++++++++------------ tests/urlstore_tests.py | 3 +++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/courlan/urlstore.py b/courlan/urlstore.py index ea7a981..331e124 100644 --- a/courlan/urlstore.py +++ b/courlan/urlstore.py @@ -172,6 +172,9 @@ def _store_urls( # timestamp/backoff value if timestamp is not None: self.urldict[domain].timestamp = timestamp + # adjust general state + if self.done and not self.urldict[domain].all_visited: + self.done = False def _search_urls( self, urls: List[str], switch: Optional[int] = None @@ -248,8 +251,15 @@ def is_exhausted_domain(self, domain: str) -> bool: raise KeyError("website not in store") def get_unvisited_domains(self) -> List[str]: - "Return all domains which have not been all visited." - return [d for d in self.urldict if not self.urldict[d].all_visited] + """Find all domains for which there are unvisited URLs + and potentially adjust done meta-information.""" + unvisited = [] + with self._lock: + if not self.done: + unvisited = [d for d in self.urldict if not self.urldict[d].all_visited] + if not unvisited: + self.done = True + return unvisited # URL-BASED QUERIES @@ -262,7 +272,11 @@ def has_been_visited(self, url: str) -> bool: def find_unvisited_urls(self, domain: str) -> List[str]: "Get all unvisited URLs for the given domain." - return [domain + u.urlpath for u in self._load_urls(domain) if not u.visited] + if not self.is_exhausted_domain(domain): + return [ + domain + u.urlpath for u in self._load_urls(domain) if not u.visited + ] + return [] def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]: "Take a list of URLs and return the currently unvisited ones." @@ -277,7 +291,7 @@ def unvisited_websites_number(self) -> int: def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]: "Retrieve a single URL and consider it to be visited (with corresponding timestamp)." # not fully used - if not self.urldict[domain].all_visited: + if not self.is_exhausted_domain(domain): url_tuples = self._load_urls(domain) # get first non-seen url for url in url_tuples: @@ -297,10 +311,8 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]: def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]: """Get a list of immediately downloadable URLs according to the given time limit per domain.""" - with self._lock: - potential = [d for d in self.urldict if not self.urldict[d].all_visited] + potential = self.get_unvisited_domains() if not potential: - self.done = True return [] targets = [] for domain in potential: @@ -319,11 +331,9 @@ def establish_download_schedule( """Get up to the specified number of URLs along with a suitable backoff schedule (in seconds).""" # see which domains are free - with self._lock: - potential = [d for d in self.urldict if not self.urldict[d].all_visited] - if not potential: - self.done = True - return [] + potential = self.get_unvisited_domains() + if not potential: + return [] # variables init per_domain = max_urls // len(potential) or 1 targets: List[Tuple[float, str]] = [] diff --git a/tests/urlstore_tests.py b/tests/urlstore_tests.py index 0d00a71..5f52f2a 100644 --- a/tests/urlstore_tests.py +++ b/tests/urlstore_tests.py @@ -132,6 +132,7 @@ def test_urlstore(): my_urls.add_urls(["https://visited.com/visited"], visited=True) assert my_urls.urldict["https://visited.com"].tuples[0].visited is True assert my_urls.urldict["https://visited.com"].all_visited is True + assert not my_urls.find_unvisited_urls("https://visited.com") assert my_urls.is_exhausted_domain("https://visited.com") is True # new unvisited URLs my_urls.add_urls(["https://visited.com/1"], visited=False) @@ -271,6 +272,8 @@ def test_urlstore(): schedule = other_store.establish_download_schedule() assert not schedule # store exhaustion + other_store.add_urls(["https://www.test.org/1"]) + assert other_store.done is False other_store = UrlStore() other_store.add_urls( ["http://domain.fi/page1", "http://domain.fi/page2", "http://domain.no/0"]