adjust unvisited urls, domains and done meta-info (#41)
* adjust unvisited domains and done meta-info

* add test

* use domain exhaustion methods

* add test
adbar authored Jun 1, 2023
1 parent 05c6e20 commit ce42fcb
Showing 2 changed files with 25 additions and 12 deletions.
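
Summary of the change: the store-wide done flag is now kept in sync in both directions. get_unvisited_domains() sets it once every domain is exhausted, and _store_urls() clears it again when fresh URLs arrive. Below is a minimal sketch of the resulting behavior, assuming a standard install where UrlStore is importable from the courlan package and get_download_urls() marks retrieved URLs as visited:

from courlan import UrlStore

store = UrlStore()
store.add_urls(["https://example.org/1"])
store.get_download_urls()        # retrieves the URL and marks it visited
store.get_unvisited_domains()    # returns [] and flips store.done to True
store.add_urls(["https://example.org/2"])
assert store.done is False       # _store_urls() resets the flag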
34 changes: 22 additions & 12 deletions courlan/urlstore.py
@@ -172,6 +172,9 @@ def _store_urls(
         # timestamp/backoff value
         if timestamp is not None:
             self.urldict[domain].timestamp = timestamp
+        # adjust general state
+        if self.done and not self.urldict[domain].all_visited:
+            self.done = False
 
     def _search_urls(
         self, urls: List[str], switch: Optional[int] = None
@@ -248,8 +251,15 @@ def is_exhausted_domain(self, domain: str) -> bool:
         raise KeyError("website not in store")
 
     def get_unvisited_domains(self) -> List[str]:
-        "Return all domains which have not been all visited."
-        return [d for d in self.urldict if not self.urldict[d].all_visited]
+        """Find all domains for which there are unvisited URLs
+        and potentially adjust done meta-information."""
+        unvisited = []
+        with self._lock:
+            if not self.done:
+                unvisited = [d for d in self.urldict if not self.urldict[d].all_visited]
+                if not unvisited:
+                    self.done = True
+        return unvisited
 
     # URL-BASED QUERIES
 
@@ -262,7 +272,11 @@ def has_been_visited(self, url: str) -> bool:
 
     def find_unvisited_urls(self, domain: str) -> List[str]:
         "Get all unvisited URLs for the given domain."
-        return [domain + u.urlpath for u in self._load_urls(domain) if not u.visited]
+        if not self.is_exhausted_domain(domain):
+            return [
+                domain + u.urlpath for u in self._load_urls(domain) if not u.visited
+            ]
+        return []
 
     def filter_unvisited_urls(self, urls: List[str]) -> List[Union[Any, str]]:
         "Take a list of URLs and return the currently unvisited ones."
@@ -277,7 +291,7 @@ def unvisited_websites_number(self) -> int:
     def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
         "Retrieve a single URL and consider it to be visited (with corresponding timestamp)."
         # not fully used
-        if not self.urldict[domain].all_visited:
+        if not self.is_exhausted_domain(domain):
             url_tuples = self._load_urls(domain)
             # get first non-seen url
             for url in url_tuples:
@@ -297,10 +311,8 @@ def get_url(self, domain: str, as_visited: bool = True) -> Optional[str]:
     def get_download_urls(self, timelimit: int = 10) -> Optional[List[str]]:
         """Get a list of immediately downloadable URLs according to the given
         time limit per domain."""
-        with self._lock:
-            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
+        potential = self.get_unvisited_domains()
         if not potential:
-            self.done = True
             return []
         targets = []
         for domain in potential:
@@ -319,11 +331,9 @@ def establish_download_schedule(
         """Get up to the specified number of URLs along with a suitable
         backoff schedule (in seconds)."""
         # see which domains are free
-        with self._lock:
-            potential = [d for d in self.urldict if not self.urldict[d].all_visited]
-            if not potential:
-                self.done = True
-                return []
+        potential = self.get_unvisited_domains()
+        if not potential:
+            return []
         # variables init
         per_domain = max_urls // len(potential) or 1
         targets: List[Tuple[float, str]] = []
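
Note on the refactoring above: get_download_urls() and establish_download_schedule() previously duplicated the lock acquisition and the done bookkeeping; both now delegate to get_unvisited_domains(), so the flag is maintained in exactly one place. The exhaustion check follows the same pattern — a sketch mirroring the updated test:

store = UrlStore()
store.add_urls(["https://visited.com/visited"], visited=True)
assert store.is_exhausted_domain("https://visited.com") is True
# find_unvisited_urls() now short-circuits on exhausted domains
# instead of scanning the stored URL tuples again:
assert store.find_unvisited_urls("https://visited.com") == []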
3 changes: 3 additions & 0 deletions tests/urlstore_tests.py
@@ -132,6 +132,7 @@ def test_urlstore():
     my_urls.add_urls(["https://visited.com/visited"], visited=True)
     assert my_urls.urldict["https://visited.com"].tuples[0].visited is True
     assert my_urls.urldict["https://visited.com"].all_visited is True
+    assert not my_urls.find_unvisited_urls("https://visited.com")
     assert my_urls.is_exhausted_domain("https://visited.com") is True
     # new unvisited URLs
     my_urls.add_urls(["https://visited.com/1"], visited=False)
@@ -271,6 +272,8 @@ def test_urlstore():
     schedule = other_store.establish_download_schedule()
     assert not schedule
     # store exhaustion
+    other_store.add_urls(["https://www.test.org/1"])
+    assert other_store.done is False
     other_store = UrlStore()
     other_store.add_urls(
         ["http://domain.fi/page1", "http://domain.fi/page2", "http://domain.no/0"]
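
The two new test lines exercise the reset path: once the store has been drained and done is True, adding a URL must flip it back. A hedged walk-through of that sequence (the drain loop is illustrative, not part of the test suite):

store = UrlStore()
store.add_urls(["http://domain.fi/page1"])
while store.establish_download_schedule():
    pass                            # drain the store completely
assert store.done is True           # set by get_unvisited_domains()
store.add_urls(["https://www.test.org/1"])
assert store.done is False          # reset by _store_urls()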
