From dd90b34cd2751b3d8d9ad3d4713947092723318d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 19 Aug 2025 11:10:10 +0200 Subject: [PATCH 1/2] chore: Remove sortedcollections and sortedcontainers packages --- pyproject.toml | 4 -- src/crawlee/_autoscaling/snapshotter.py | 33 +++++++++++++--- tests/unit/_autoscaling/test_snapshotter.py | 32 ++++++++++++++++ uv.lock | 42 +-------------------- 4 files changed, 60 insertions(+), 51 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b830e8c803..11d9f76102 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,6 @@ dependencies = [ "pydantic-settings>=2.2.0,!=2.7.0,!=2.7.1,!=2.8.0", "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2", "pyee>=9.0.0", - "sortedcollections>=2.1.0", - "sortedcontainers>=2.4.0", "tldextract>=5.1.0", "typing-extensions>=4.1.0", "yarl>=1.18.0", @@ -103,7 +101,6 @@ dev = [ "pytest~=8.4.0", "ruff~=0.12.0", "setuptools", # setuptools are used by pytest, but not explicitly required - "sortedcontainers-stubs~=2.4.0", "types-beautifulsoup4~=4.12.0.20240229", "types-cachetools~=6.1.0.20250717", "types-colorama~=0.4.15.20240106", @@ -247,7 +244,6 @@ module = [ "litestar", # Example code shows deploy on Google Cloud Run. "loguru", # Example code shows integration of loguru and crawlee for JSON logging. "sklearn.linear_model", # Untyped and stubs not available - "sortedcollections", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available "warcio.*", # Example code shows WARC files creation. diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index cdbe3c0a44..63c01326db 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -4,11 +4,8 @@ from datetime import datetime, timedelta, timezone from logging import getLogger -from operator import attrgetter from typing import TYPE_CHECKING, TypeVar, cast -from sortedcontainers import SortedList - from crawlee import service_locator from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot from crawlee._utils.byte_size import ByteSize @@ -25,7 +22,25 @@ logger = getLogger(__name__) -T = TypeVar('T') +T = TypeVar('T', bound=Snapshot) + + +class SortedSnapshotList(list[T]): + """A list that maintains sorted order by `created_at` attribute for snapshot objects.""" + + def add(self, item: T) -> None: + """Add an item to the list maintaining sorted order by `created_at` using binary search.""" + left, right = 0, len(self) + item_time = item.created_at + + while left < right: + mid = (left + right) // 2 + if self[mid].created_at <= item_time: + left = mid + 1 + else: + right = mid + + self.insert(left, item) @docs_group('Autoscaling') @@ -127,8 +142,14 @@ def from_config(cls, config: Configuration | None = None) -> Snapshotter: ) @staticmethod - def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedList[T]: - return SortedList(input_list, key=attrgetter('created_at')) + def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnapshotList[T]: + """Create a sorted list from the input list. + + Returns a custom list that maintains sorted order by created_at when items are added. + """ + result = SortedSnapshotList[T]() + result.extend(input_list) + return result @property def active(self) -> bool: diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py index 5cb0b63d67..e923eac421 100644 --- a/tests/unit/_autoscaling/test_snapshotter.py +++ b/tests/unit/_autoscaling/test_snapshotter.py @@ -10,6 +10,7 @@ from crawlee import service_locator from crawlee._autoscaling import Snapshotter from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, Snapshot +from crawlee._autoscaling.snapshotter import SortedSnapshotList from crawlee._utils.byte_size import ByteSize from crawlee._utils.system import CpuInfo, MemoryInfo from crawlee.configuration import Configuration @@ -299,3 +300,34 @@ def create_event_data(creation_time: datetime) -> EventSystemInfoData: assert cpu_samples[0].created_at == time_old assert memory_samples[1].created_at == time_new assert cpu_samples[1].created_at == time_new + + +def test_sorted_snapshot_list_add_maintains_order() -> None: + """Test that SortedSnapshotList.add method maintains sorted order by created_at with multiple items.""" + sorted_list = SortedSnapshotList[CpuSnapshot]() + + # Create snapshots with different timestamps (more items to test binary search better) + now = datetime.now(timezone.utc) + snapshots = [ + CpuSnapshot(used_ratio=0.1, max_used_ratio=0.95, created_at=now - timedelta(seconds=50)), # oldest + CpuSnapshot(used_ratio=0.2, max_used_ratio=0.95, created_at=now - timedelta(seconds=40)), + CpuSnapshot(used_ratio=0.3, max_used_ratio=0.95, created_at=now - timedelta(seconds=30)), + CpuSnapshot(used_ratio=0.4, max_used_ratio=0.95, created_at=now - timedelta(seconds=20)), + CpuSnapshot(used_ratio=0.5, max_used_ratio=0.95, created_at=now - timedelta(seconds=10)), + CpuSnapshot(used_ratio=0.6, max_used_ratio=0.95, created_at=now - timedelta(seconds=5)), + CpuSnapshot(used_ratio=0.7, max_used_ratio=0.95, created_at=now), # newest + ] + + # Add snapshots in random order to test binary search insertion + add_order = [3, 0, 5, 1, 6, 2, 4] # indices in random order + for i in add_order: + sorted_list.add(snapshots[i]) + + # Verify the list is sorted by created_at (should be in original order) + assert len(sorted_list) == 7 + for i, snapshot in enumerate(sorted_list): + assert snapshot == snapshots[i], f'Item at index {i} is not correctly sorted' + if i > 0: + prev_time = sorted_list[i - 1].created_at + curr_time = snapshot.created_at + assert prev_time <= curr_time, f'Items at indices {i - 1} and {i} are not in chronological order' diff --git a/uv.lock b/uv.lock index 4262b3b80a..7d388aed0b 100644 --- a/uv.lock +++ b/uv.lock @@ -589,8 +589,6 @@ dependencies = [ { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pyee" }, - { name = "sortedcollections" }, - { name = "sortedcontainers" }, { name = "tldextract" }, { name = "typing-extensions" }, { name = "yarl" }, @@ -679,7 +677,6 @@ dev = [ { name = "pytest-xdist" }, { name = "ruff" }, { name = "setuptools" }, - { name = "sortedcontainers-stubs" }, { name = "types-beautifulsoup4" }, { name = "types-cachetools" }, { name = "types-colorama" }, @@ -724,8 +721,6 @@ requires-dist = [ { name = "pyee", specifier = ">=9.0.0" }, { name = "rich", marker = "extra == 'cli'", specifier = ">=13.9.0" }, { name = "scikit-learn", marker = "extra == 'adaptive-crawler'", specifier = ">=1.6.0" }, - { name = "sortedcollections", specifier = ">=2.1.0" }, - { name = "sortedcontainers", specifier = ">=2.4.0" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, { name = "typing-extensions", specifier = ">=4.1.0" }, @@ -750,7 +745,6 @@ dev = [ { name = "pytest-xdist", specifier = "~=3.8.0" }, { name = "ruff", specifier = "~=0.12.0" }, { name = "setuptools" }, - { name = "sortedcontainers-stubs", specifier = "~=2.4.0" }, { name = "types-beautifulsoup4", specifier = "~=4.12.0.20240229" }, { name = "types-cachetools", specifier = "~=6.1.0.20250717" }, { name = "types-colorama", specifier = "~=0.4.15.20240106" }, @@ -914,7 +908,7 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -2845,40 +2839,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] -[[package]] -name = "sortedcollections" -version = "2.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sortedcontainers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/01/00/6d749cc1f88e7f95f5442a8abb195fa607094deba9e0475affbfb7fa8c04/sortedcollections-2.1.0.tar.gz", hash = "sha256:d8e9609d6c580a16a1224a3dc8965789e03ebc4c3e5ffd05ada54a2fed5dcacd", size = 9287, upload-time = "2021-01-18T22:15:16.623Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/39/c993a7d0c9dbf3aeca5008bdd00e4436ad9b7170527cef0a14634b47001f/sortedcollections-2.1.0-py3-none-any.whl", hash = "sha256:b07abbc73472cc459da9dd6e2607d73d1f3b9309a32dd9a57fa2c6fa882f4c6c", size = 9531, upload-time = "2021-01-18T22:15:15.36Z" }, -] - -[[package]] -name = "sortedcontainers" -version = "2.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, -] - -[[package]] -name = "sortedcontainers-stubs" -version = "2.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "sortedcontainers" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/19/13/2940fafc136048ec99c932bf0077613f1ae4693a313705162982335fe5fb/sortedcontainers_stubs-2.4.3.tar.gz", hash = "sha256:ba172daceda4bd617d2f6ffd24582261a494da92705409651967faa40ebc75dd", size = 6186, upload-time = "2025-04-23T07:42:00.238Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/3e/b0fbd7621e6430d33d234ad5e2dc9b0d4df575518f4790a2af934e1029ea/sortedcontainers_stubs-2.4.3-py3-none-any.whl", hash = "sha256:4496109dfa6645e4b675f57fbc7e42ec4d1bed2c74aab7fa379e0795e49fe406", size = 8816, upload-time = "2025-04-23T07:41:58.625Z" }, -] - [[package]] name = "soupsieve" version = "2.7" From 60f3ace7f59f68fedf45040a70a5216d8fc524e2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 19 Aug 2025 11:14:35 +0200 Subject: [PATCH 2/2] use bisect --- src/crawlee/_autoscaling/snapshotter.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index 63c01326db..90d44db65e 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -2,6 +2,7 @@ from __future__ import annotations +import bisect from datetime import datetime, timedelta, timezone from logging import getLogger from typing import TYPE_CHECKING, TypeVar, cast @@ -30,17 +31,7 @@ class SortedSnapshotList(list[T]): def add(self, item: T) -> None: """Add an item to the list maintaining sorted order by `created_at` using binary search.""" - left, right = 0, len(self) - item_time = item.created_at - - while left < right: - mid = (left + right) // 2 - if self[mid].created_at <= item_time: - left = mid + 1 - else: - right = mid - - self.insert(left, item) + bisect.insort(self, item, key=lambda item: item.created_at) @docs_group('Autoscaling')