Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ dependencies = [
"pydantic-settings>=2.2.0,!=2.7.0,!=2.7.1,!=2.8.0",
"pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2",
"pyee>=9.0.0",
"sortedcollections>=2.1.0",
"sortedcontainers>=2.4.0",
"tldextract>=5.1.0",
"typing-extensions>=4.1.0",
"yarl>=1.18.0",
Expand Down Expand Up @@ -103,7 +101,6 @@ dev = [
"pytest~=8.4.0",
"ruff~=0.12.0",
"setuptools", # setuptools are used by pytest, but not explicitly required
"sortedcontainers-stubs~=2.4.0",
"types-beautifulsoup4~=4.12.0.20240229",
"types-cachetools~=6.1.0.20250717",
"types-colorama~=0.4.15.20240106",
Expand Down Expand Up @@ -247,7 +244,6 @@ module = [
"litestar", # Example code shows deploy on Google Cloud Run.
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"sklearn.linear_model", # Untyped and stubs not available
"sortedcollections", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
"warcio.*", # Example code shows WARC files creation.
Expand Down
24 changes: 18 additions & 6 deletions src/crawlee/_autoscaling/snapshotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@

from __future__ import annotations

import bisect
from datetime import datetime, timedelta, timezone
from logging import getLogger
from operator import attrgetter
from typing import TYPE_CHECKING, TypeVar, cast

from sortedcontainers import SortedList

from crawlee import service_locator
from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
from crawlee._utils.byte_size import ByteSize
Expand All @@ -25,7 +23,15 @@

# Module-level logger named after this module.
logger = getLogger(__name__)

# Generic type variable with no bound.
T = TypeVar('T')
# Generic type variable restricted to `Snapshot` and its subclasses.
T = TypeVar('T', bound=Snapshot)


class SortedSnapshotList(list[T]):
    """List of snapshots that stays ordered by `created_at` as long as items are inserted through `add`."""

    def add(self, item: T) -> None:
        """Insert `item` at the position that keeps the list ordered by `created_at`.

        The position is located with a binary search; an item whose `created_at` equals that of
        existing entries is placed after them.
        """
        position = bisect.bisect_right(self, item.created_at, key=lambda snapshot: snapshot.created_at)
        self.insert(position, item)


@docs_group('Autoscaling')
Expand Down Expand Up @@ -127,8 +133,14 @@ def from_config(cls, config: Configuration | None = None) -> Snapshotter:
)

@staticmethod
def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedList[T]:
    """Build a `SortedList` from `input_list`, keyed on each item's `created_at` attribute."""
    by_created_at = attrgetter('created_at')
    return SortedList(input_list, key=by_created_at)
def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedSnapshotList[T]:
    """Create a `SortedSnapshotList` holding the items of `input_list` ordered by `created_at`.

    Args:
        input_list: Snapshots to seed the list with; they may be given in any order.

    Returns:
        A new `SortedSnapshotList` with the same items, sorted by `created_at` in ascending order.
        Items with equal `created_at` keep their relative order from `input_list`.
    """
    # `SortedSnapshotList.add` inserts via binary search, which is only valid when the existing
    # contents are already ordered. Sort the seed items instead of copying them verbatim so the
    # invariant holds from the start.
    return SortedSnapshotList[T](sorted(input_list, key=lambda snapshot: snapshot.created_at))

@property
def active(self) -> bool:
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/_autoscaling/test_snapshotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from crawlee import service_locator
from crawlee._autoscaling import Snapshotter
from crawlee._autoscaling._types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, Snapshot
from crawlee._autoscaling.snapshotter import SortedSnapshotList
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.system import CpuInfo, MemoryInfo
from crawlee.configuration import Configuration
Expand Down Expand Up @@ -299,3 +300,34 @@ def create_event_data(creation_time: datetime) -> EventSystemInfoData:
assert cpu_samples[0].created_at == time_old
assert memory_samples[1].created_at == time_new
assert cpu_samples[1].created_at == time_new


def test_sorted_snapshot_list_add_maintains_order() -> None:
    """Check that `SortedSnapshotList.add` keeps items ordered by `created_at` for out-of-order inserts."""
    sorted_list = SortedSnapshotList[CpuSnapshot]()

    # Seven snapshots listed oldest first; each pair is (used_ratio, age in seconds relative to `now`).
    now = datetime.now(timezone.utc)
    ratio_and_age = [(0.1, 50), (0.2, 40), (0.3, 30), (0.4, 20), (0.5, 10), (0.6, 5), (0.7, 0)]
    snapshots = [
        CpuSnapshot(used_ratio=ratio, max_used_ratio=0.95, created_at=now - timedelta(seconds=age))
        for ratio, age in ratio_and_age
    ]

    # Insert in a fixed shuffled order so that insertions land at the front, middle and end of the list.
    for index in (3, 0, 5, 1, 6, 2, 4):
        sorted_list.add(snapshots[index])

    # After all insertions the list must match the oldest-first order of `snapshots`.
    assert len(sorted_list) == 7
    for i, snapshot in enumerate(sorted_list):
        assert snapshot == snapshots[i], f'Item at index {i} is not correctly sorted'
        if i == 0:
            continue
        prev_time = sorted_list[i - 1].created_at
        curr_time = snapshot.created_at
        assert prev_time <= curr_time, f'Items at indices {i - 1} and {i} are not in chronological order'
42 changes: 1 addition & 41 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.