From a2a3b332ad71abb3f5a5126fe6ddfcbc84972d63 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 5 Nov 2024 16:57:42 +0100 Subject: [PATCH 1/3] Simplify request queue --- .../_request_queue_client.py | 18 ++- src/crawlee/storages/_request_queue.py | 129 +----------------- 2 files changed, 20 insertions(+), 127 deletions(-) diff --git a/src/crawlee/memory_storage_client/_request_queue_client.py b/src/crawlee/memory_storage_client/_request_queue_client.py index 686b11de3..28fc885ec 100644 --- a/src/crawlee/memory_storage_client/_request_queue_client.py +++ b/src/crawlee/memory_storage_client/_request_queue_client.py @@ -73,6 +73,8 @@ def __init__( self.file_operation_lock = asyncio.Lock() self._last_used_timestamp = Decimal(0.0) + self._in_progress = set[str]() + @property def resource_info(self) -> RequestQueueMetadata: """Get the resource info for the request queue client.""" @@ -171,7 +173,7 @@ async def delete(self) -> None: await asyncio.to_thread(shutil.rmtree, queue.resource_directory) @override - async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: + async def list_head(self, *, limit: int | None = None, skip_in_progress: bool = False) -> RequestQueueHead: existing_queue_by_id = find_or_create_client_by_id_or_name_inner( resource_client_class=RequestQueueClient, memory_storage_client=self._memory_storage_client, @@ -196,6 +198,9 @@ async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: if len(requests) == limit: break + if skip_in_progress and request_key in self._in_progress: + continue + request = existing_queue_by_id.requests.get(request_key) # Check that the request still exists and was not handled, @@ -214,7 +219,11 @@ async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: @override async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: - result = await self.list_head(limit=limit) + result = await self.list_head(limit=limit, skip_in_progress=True) + + for item in result.items: + self._in_progress.add(item.id) + return RequestQueueHeadWithLocks( lock_secs=lock_secs, limit=result.limit, @@ -344,6 +353,9 @@ async def update_request( persist_storage=self._memory_storage_client.persist_storage, ) + if request.handled_at is not None: + self._in_progress.discard(request.id) + return ProcessedRequest( id=request_model.id, unique_key=request_model.unique_key, @@ -395,7 +407,7 @@ async def delete_request_lock( *, forefront: bool = False, ) -> None: - return None + self._in_progress.discard(request_id) @override async def batch_add_requests( diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index bd1127d48..8d85076e2 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -5,7 +5,7 @@ from contextlib import suppress from datetime import datetime, timedelta, timezone from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, TypedDict, TypeVar +from typing import TYPE_CHECKING, Any, TypedDict, TypeVar from typing_extensions import override @@ -31,30 +31,6 @@ T = TypeVar('T') -class BoundedSet(Generic[T]): - """A simple set datastructure that removes the least recently accessed item when it reaches `max_length`.""" - - def __init__(self, max_length: int) -> None: - self._max_length = max_length - self._data = OrderedDict[T, object]() - - def __contains__(self, item: T) -> bool: - found = item in self._data - if found: - self._data.move_to_end(item, last=True) - return found - - def add(self, item: T) -> None: - self._data[item] = True - self._data.move_to_end(item) - - if len(self._data) > self._max_length: - self._data.popitem(last=False) - - def clear(self) -> None: - self._data.clear() - - class CachedRequest(TypedDict): id: str was_already_handled: bool @@ -97,12 +73,6 @@ class RequestQueue(BaseStorage, RequestProvider): _MAX_CACHED_REQUESTS = 1_000_000 """Maximum number of requests that can be cached.""" - _RECENTLY_HANDLED_CACHE_SIZE = 1000 - """Cache size for recently handled requests.""" - - _STORAGE_CONSISTENCY_DELAY = timedelta(seconds=3) - """Expected delay for storage to achieve consistency, guiding the timing of subsequent read operations.""" - def __init__( self, id: str, @@ -117,7 +87,6 @@ def __init__( # Get resource clients from storage client self._resource_client = client.request_queue(self._id) - self._resource_collection_client = client.request_queues() self._request_lock_time = timedelta(minutes=3) self._queue_paused_for_migration = False @@ -134,9 +103,7 @@ def __init__( self._assumed_handled_count = 0 self._queue_head_dict: OrderedDict[str, str] = OrderedDict() self._list_head_and_lock_task: asyncio.Task | None = None - self._in_progress: set[str] = set() self._last_activity = datetime.now(timezone.utc) - self._recently_handled: BoundedSet[str] = BoundedSet(max_length=self._RECENTLY_HANDLED_CACHE_SIZE) self._requests_cache: LRUCache[CachedRequest] = LRUCache(max_length=self._MAX_CACHED_REQUESTS) @override @@ -209,15 +176,7 @@ async def add_request( self._cache_request(cache_key, processed_request) - request_id, was_already_present = processed_request.id, processed_request.was_already_present - is_handled = request.handled_at is not None - - if ( - not is_handled - and not was_already_present - and request_id not in self._in_progress - and request_id not in self._recently_handled - ): + if request.handled_at is None and not processed_request.was_already_present: self._assumed_total_count += 1 return processed_request @@ -300,27 +259,7 @@ async def fetch_next_request(self) -> Request | None: return None next_request_id, _ = self._queue_head_dict.popitem(last=False) # ~removeFirst() - - # This should never happen, but... - if next_request_id in self._in_progress or next_request_id in self._recently_handled: - logger.warning( - 'Queue head returned a request that is already in progress?!', - extra={ - 'nextRequestId': next_request_id, - 'inProgress': next_request_id in self._in_progress, - 'recentlyHandled': next_request_id in self._recently_handled, - }, - ) - return None - - self._in_progress.add(next_request_id) - - try: - request = await self._get_or_hydrate_request(next_request_id) - except Exception: - # On error, remove the request from in progress, otherwise it would be there forever - self._in_progress.remove(next_request_id) - raise + request = await self._get_or_hydrate_request(next_request_id) # NOTE: It can happen that the queue head index is inconsistent with the main queue table. # This can occur in two situations: @@ -336,10 +275,6 @@ async def fetch_next_request(self) -> Request | None: 'Cannot find a request from the beginning of queue, will be retried later', extra={'nextRequestId': next_request_id}, ) - asyncio.get_running_loop().call_later( - self._STORAGE_CONSISTENCY_DELAY.total_seconds(), - lambda: self._in_progress.remove(next_request_id), - ) return None # 2) @@ -352,7 +287,6 @@ async def fetch_next_request(self) -> Request | None: 'Request fetched from the beginning of queue was already handled', extra={'nextRequestId': next_request_id}, ) - self._recently_handled.add(next_request_id) return None return request @@ -370,19 +304,12 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | """ self._last_activity = datetime.now(timezone.utc) - if request.id not in self._in_progress: - logger.debug(f'Cannot mark request (ID: {request.id}) as handled, because it is not in progress!') - return None - if request.handled_at is None: request.handled_at = datetime.now(timezone.utc) processed_request = await self._resource_client.update_request(request) processed_request.unique_key = request.unique_key - self._in_progress.remove(request.id) - self._recently_handled.add(request.id) - if not processed_request.was_already_handled: self._assumed_handled_count += 1 @@ -408,10 +335,6 @@ async def reclaim_request( """ self._last_activity = datetime.now(timezone.utc) - if request.id not in self._in_progress: - logger.debug(f'Cannot reclaim request (ID: {request.id}), because it is not in progress!') - return None - # TODO: If request hasn't been changed since the last get_request(), we don't need to call update_request() # and thus improve performance. # https://github.com/apify/apify-sdk-python/issues/143 @@ -420,11 +343,6 @@ async def reclaim_request( self._cache_request(unique_key_to_request_id(request.unique_key), processed_request) if processed_request: - # Mark the request as no longer in progress, - # as the moment we delete the lock, we could end up also re-fetching the request in a subsequent - # _ensure_head_is_non_empty() which could potentially lock the request again - self._in_progress.discard(request.id) - # Try to delete the request lock if possible try: await self._resource_client.delete_request_lock(request.id, forefront=forefront) @@ -451,21 +369,6 @@ async def is_finished(self) -> bool: Returns: bool: `True` if all requests were already handled and there are no more left. `False` otherwise. """ - seconds_since_last_activity = datetime.now(timezone.utc) - self._last_activity - if self._in_progress_count() > 0 and seconds_since_last_activity > self._internal_timeout: - logger.warning( - f'The request queue seems to be stuck for {self._internal_timeout.total_seconds()}s, ' - 'resetting internal state.', - extra={ - 'queue_head_ids_pending': len(self._queue_head_dict), - 'in_progress': list(self._in_progress), - }, - ) - - # We only need to reset these two variables, no need to reset all the other stats - self._queue_head_dict.clear() - self._in_progress.clear() - if self._queue_head_dict: logger.debug( 'There are still ids in the queue head that are pending processing', @@ -476,16 +379,6 @@ async def is_finished(self) -> bool: return False - if self._in_progress: - logger.debug( - 'There are still requests in progress (or zombie)', - extra={ - 'in_progress': list(self._in_progress), - }, - ) - - return False - current_head = await self._resource_client.list_head(limit=2) if current_head.items: @@ -493,7 +386,7 @@ async def is_finished(self) -> bool: 'Queue head still returned requests that need to be processed (or that are locked by other clients)', ) - return not current_head.items and not self._in_progress + return not current_head.items async def get_info(self) -> RequestQueueMetadata | None: """Get an object containing general information about the request queue.""" @@ -534,19 +427,12 @@ async def _list_head_and_lock(self) -> None: for request in response.items: # Queue head index might be behind the main table, so ensure we don't recycle requests - if ( - not request.id - or not request.unique_key - or request.id in self._in_progress - or request.id in self._recently_handled - ): + if not request.id or not request.unique_key: logger.debug( 'Skipping request from queue head, already in progress or recently handled', extra={ 'id': request.id, 'unique_key': request.unique_key, - 'in_progress': request.id in self._in_progress, - 'recently_handled': request.id in self._recently_handled, }, ) @@ -568,14 +454,9 @@ async def _list_head_and_lock(self) -> None: ), ) - def _in_progress_count(self) -> int: - return len(self._in_progress) - def _reset(self) -> None: self._queue_head_dict.clear() self._list_head_and_lock_task = None - self._in_progress.clear() - self._recently_handled.clear() self._assumed_total_count = 0 self._assumed_handled_count = 0 self._requests_cache.clear() From c2468d0512c4f1d8cd72e02b3d542739280da11f Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 5 Nov 2024 22:43:37 +0100 Subject: [PATCH 2/3] Do that convoluted memory storage thingy --- .../_request_queue_client.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/crawlee/memory_storage_client/_request_queue_client.py b/src/crawlee/memory_storage_client/_request_queue_client.py index 28fc885ec..c5b60b9c2 100644 --- a/src/crawlee/memory_storage_client/_request_queue_client.py +++ b/src/crawlee/memory_storage_client/_request_queue_client.py @@ -198,7 +198,7 @@ async def list_head(self, *, limit: int | None = None, skip_in_progress: bool = if len(requests) == limit: break - if skip_in_progress and request_key in self._in_progress: + if skip_in_progress and request_key in existing_queue_by_id._in_progress: # noqa: SLF001 continue request = existing_queue_by_id.requests.get(request_key) @@ -219,10 +219,20 @@ async def list_head(self, *, limit: int | None = None, skip_in_progress: bool = @override async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: + existing_queue_by_id = find_or_create_client_by_id_or_name_inner( + resource_client_class=RequestQueueClient, + memory_storage_client=self._memory_storage_client, + id=self.id, + name=self.name, + ) + + if existing_queue_by_id is None: + raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + result = await self.list_head(limit=limit, skip_in_progress=True) for item in result.items: - self._in_progress.add(item.id) + existing_queue_by_id._in_progress.add(item.id) # noqa: SLF001 return RequestQueueHeadWithLocks( lock_secs=lock_secs, @@ -354,7 +364,7 @@ async def update_request( ) if request.handled_at is not None: - self._in_progress.discard(request.id) + existing_queue_by_id._in_progress.discard(request.id) # noqa: SLF001 return ProcessedRequest( id=request_model.id, @@ -407,7 +417,17 @@ async def delete_request_lock( *, forefront: bool = False, ) -> None: - self._in_progress.discard(request_id) + existing_queue_by_id = find_or_create_client_by_id_or_name_inner( + resource_client_class=RequestQueueClient, + memory_storage_client=self._memory_storage_client, + id=self.id, + name=self.name, + ) + + if existing_queue_by_id is None: + raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + + existing_queue_by_id._in_progress.discard(request_id) # noqa: SLF001 @override async def batch_add_requests( From 87e08b965fd90b51cd214736d99292f70a7ef2f9 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 14 Nov 2024 14:45:59 +0100 Subject: [PATCH 3/3] WIP update is_finished() --- src/crawlee/storages/_request_queue.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index fe28b0348..decebcb5a 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -381,14 +381,28 @@ async def is_finished(self) -> bool: return False - current_head = await self._resource_client.list_head(limit=2) + await self._ensure_head_is_non_empty() - if current_head.items: + if self._queue_head_dict: logger.debug( - 'Queue head still returned requests that need to be processed (or that are locked by other clients)', + 'Queue head still returned requests that need to be processed (or that are locked by other clients)' ) - return not current_head.items + return False + + metadata = await self._resource_client.get() + if metadata is not None and not metadata.had_multiple_clients and not self._queue_head_dict: + logger.debug('Queue head is empty and there are no other clients - we are finished') + + return True + + current_head = await self._resource_client.list_head(limit=2) + + if not current_head.items: + await asyncio.sleep(5) # TODO not sure how long we should sleep + current_head = await self._resource_client.list_head(limit=2) + + return len(current_head.items) == 0 async def get_info(self) -> RequestQueueMetadata | None: """Get an object containing general information about the request queue."""