Skip to content

Commit 30de3bd

Browse files
authored
feat: Prevent overloading system memory when running locally (#1270)
- closes #1232
1 parent 456c416 commit 30de3bd

File tree

6 files changed

+107
-5
lines changed

6 files changed

+107
-5
lines changed

src/crawlee/_autoscaling/_types.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
from crawlee._utils.byte_size import ByteSize
99

1010

11+
SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD = 0.97
12+
13+
1114
@dataclass
1215
class LoadRatioInfo:
1316
"""Represent the load ratio of a resource."""
@@ -91,9 +94,15 @@ class MemorySnapshot:
9194
current_size: ByteSize
9295
"""Memory usage of the current Python process and its children."""
9396

97+
system_wide_used_size: ByteSize | None
98+
"""Memory usage of all processes, system-wide."""
99+
94100
max_memory_size: ByteSize
95101
"""The maximum memory that can be used by `AutoscaledPool`."""
96102

103+
system_wide_memory_size: ByteSize | None
104+
"""Total memory available in the whole system."""
105+
97106
max_used_memory_ratio: float
98107
"""The maximum acceptable ratio of `current_size` to `max_memory_size`."""
99108

@@ -103,6 +112,11 @@ class MemorySnapshot:
103112
@property
104113
def is_overloaded(self) -> bool:
105114
"""Indicate whether the memory is considered as overloaded."""
115+
if self.system_wide_memory_size is not None and self.system_wide_used_size is not None:
116+
system_wide_utilization = self.system_wide_used_size / self.system_wide_memory_size
117+
if system_wide_utilization > SYSTEM_WIDE_MEMORY_OVERLOAD_THRESHOLD:
118+
return True
119+
106120
return (self.current_size / self.max_memory_size) > self.max_used_memory_ratio
107121

108122

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from crawlee._utils.context import ensure_context
1616
from crawlee._utils.docs import docs_group
1717
from crawlee._utils.recurring_task import RecurringTask
18-
from crawlee._utils.system import get_memory_info
18+
from crawlee._utils.system import MemoryInfo, get_memory_info
1919
from crawlee.events._types import Event, EventSystemInfoData
2020

2121
if TYPE_CHECKING:
@@ -273,8 +273,14 @@ def _snapshot_memory(self, event_data: EventSystemInfoData) -> None:
273273
max_memory_size=self._max_memory_size,
274274
max_used_memory_ratio=self._max_used_memory_ratio,
275275
created_at=event_data.memory_info.created_at,
276+
system_wide_used_size=None,
277+
system_wide_memory_size=None,
276278
)
277279

280+
if isinstance(memory_info := event_data.memory_info, MemoryInfo):
281+
snapshot.system_wide_used_size = memory_info.system_wide_used_size
282+
snapshot.system_wide_memory_size = memory_info.total_size
283+
278284
snapshots = cast('list[Snapshot]', self._memory_snapshots)
279285
self._prune_snapshots(snapshots, snapshot.created_at)
280286
self._memory_snapshots.add(snapshot)

src/crawlee/_utils/system.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ class MemoryInfo(MemoryUsageInfo):
5959
]
6060
"""Total memory available in the system."""
6161

62+
system_wide_used_size: Annotated[
63+
ByteSize,
64+
PlainValidator(ByteSize.validate),
65+
PlainSerializer(lambda size: size.bytes),
66+
Field(alias='systemWideUsedSize'),
67+
]
68+
"""Total memory used by all processes system-wide (including non-crawlee processes)."""
69+
6270

6371
def get_cpu_info() -> CpuInfo:
6472
"""Retrieve the current CPU usage.
@@ -89,11 +97,12 @@ def get_memory_info() -> MemoryInfo:
8997
with suppress(psutil.NoSuchProcess):
9098
current_size_bytes += _get_used_memory(child.memory_full_info())
9199

92-
total_size_bytes = psutil.virtual_memory().total
100+
vm = psutil.virtual_memory()
93101

94102
return MemoryInfo(
95-
total_size=ByteSize(total_size_bytes),
103+
total_size=ByteSize(vm.total),
96104
current_size=ByteSize(current_size_bytes),
105+
system_wide_used_size=ByteSize(vm.total - vm.available),
97106
)
98107

99108

tests/unit/_autoscaling/test_snapshotter.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def event_system_data_info() -> EventSystemInfoData:
2929
memory_info=MemoryInfo(
3030
total_size=ByteSize.from_gb(8),
3131
current_size=ByteSize.from_gb(4),
32+
system_wide_used_size=ByteSize.from_gb(5),
3233
),
3334
)
3435

@@ -52,6 +53,28 @@ def test_snapshot_memory(snapshotter: Snapshotter, event_system_data_info: Event
5253
assert snapshotter._memory_snapshots[0].current_size == event_system_data_info.memory_info.current_size
5354

5455

56+
def test_snapshot_memory_with_memory_info_sets_system_wide_fields(snapshotter: Snapshotter) -> None:
57+
memory_info = MemoryInfo(
58+
total_size=ByteSize.from_gb(16),
59+
current_size=ByteSize.from_gb(4),
60+
system_wide_used_size=ByteSize.from_gb(12),
61+
)
62+
63+
event_data = EventSystemInfoData(
64+
cpu_info=CpuInfo(used_ratio=0.5),
65+
memory_info=memory_info,
66+
)
67+
68+
snapshotter._snapshot_memory(event_data)
69+
70+
assert len(snapshotter._memory_snapshots) == 1
71+
memory_snapshot = snapshotter._memory_snapshots[0]
72+
73+
# Test that system-wide fields are properly set
74+
assert memory_snapshot.system_wide_used_size == memory_info.system_wide_used_size
75+
assert memory_snapshot.system_wide_memory_size == memory_info.total_size
76+
77+
5578
def test_snapshot_event_loop(snapshotter: Snapshotter) -> None:
5679
snapshotter._event_loop_snapshots = Snapshotter._get_sorted_list_by_created_at(
5780
[
@@ -254,7 +277,10 @@ def create_event_data(creation_time: datetime) -> EventSystemInfoData:
254277
return EventSystemInfoData(
255278
cpu_info=CpuInfo(used_ratio=0.5, created_at=creation_time),
256279
memory_info=MemoryInfo(
257-
current_size=ByteSize(bytes=1), created_at=creation_time, total_size=ByteSize(bytes=2)
280+
current_size=ByteSize(bytes=1),
281+
created_at=creation_time,
282+
total_size=ByteSize(bytes=2),
283+
system_wide_used_size=ByteSize.from_gb(5),
258284
),
259285
)
260286

tests/unit/_autoscaling/test_system_status.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,24 +102,32 @@ def test_get_system_info(snapshotter: Snapshotter, now: datetime) -> None:
102102
max_memory_size=ByteSize.from_gb(12),
103103
max_used_memory_ratio=0.8,
104104
created_at=now - timedelta(seconds=90),
105+
system_wide_used_size=None,
106+
system_wide_memory_size=None,
105107
),
106108
MemorySnapshot(
107109
current_size=ByteSize.from_gb(7),
108110
max_memory_size=ByteSize.from_gb(8),
109111
max_used_memory_ratio=0.8,
110112
created_at=now - timedelta(seconds=60),
113+
system_wide_used_size=None,
114+
system_wide_memory_size=None,
111115
),
112116
MemorySnapshot(
113117
current_size=ByteSize.from_gb(28),
114118
max_memory_size=ByteSize.from_gb(30),
115119
max_used_memory_ratio=0.8,
116120
created_at=now - timedelta(seconds=30),
121+
system_wide_used_size=None,
122+
system_wide_memory_size=None,
117123
),
118124
MemorySnapshot(
119125
current_size=ByteSize.from_gb(48),
120126
max_memory_size=ByteSize.from_gb(60),
121127
max_used_memory_ratio=0.8,
122128
created_at=now,
129+
system_wide_used_size=None,
130+
system_wide_memory_size=None,
123131
),
124132
]
125133
)
@@ -204,3 +212,42 @@ def test_client_overloaded(
204212

205213
# Ratio of overloaded snapshots is 2/3 (2 minutes out of 3)
206214
assert system_status._is_client_overloaded().is_overloaded == is_overloaded
215+
216+
217+
def test_memory_overloaded_system_wide(snapshotter: Snapshotter, now: datetime) -> None:
218+
"""Test that system-wide memory overload is detected when system-wide memory utilization exceeds threshold."""
219+
system_status = SystemStatus(
220+
snapshotter,
221+
max_snapshot_age=timedelta(minutes=1),
222+
memory_overload_threshold=0.5, # Set high threshold so process memory won't trigger overload
223+
)
224+
225+
# Add memory snapshots with system-wide memory usage above threshold (97%)
226+
system_status._snapshotter._memory_snapshots = Snapshotter._get_sorted_list_by_created_at(
227+
[
228+
MemorySnapshot(
229+
current_size=ByteSize.from_gb(1), # Process memory is low
230+
max_memory_size=ByteSize.from_gb(8), # Max memory is high
231+
max_used_memory_ratio=0.8, # Ratio is fine
232+
created_at=now - timedelta(minutes=1),
233+
system_wide_used_size=ByteSize.from_gb(31), # System-wide used is high
234+
system_wide_memory_size=ByteSize.from_gb(32), # System-wide total (31/32 = 96.875% < 97%)
235+
),
236+
MemorySnapshot(
237+
current_size=ByteSize.from_gb(1), # Process memory is low
238+
max_memory_size=ByteSize.from_gb(8), # Max memory is high
239+
max_used_memory_ratio=0.8, # Ratio is fine
240+
created_at=now,
241+
system_wide_used_size=ByteSize.from_gb(31.5), # System-wide used is high
242+
system_wide_memory_size=ByteSize.from_gb(32), # System-wide total (31.5/32 = 98.4% > 97%)
243+
),
244+
]
245+
)
246+
247+
memory_info = system_status._is_memory_overloaded()
248+
249+
# Should be overloaded due to system-wide memory usage exceeding 97% threshold
250+
assert memory_info.is_overloaded is True
251+
# The actual ratio should be 1.0 (the entire time period from first to second snapshot is overloaded)
252+
assert memory_info.actual_ratio == 1.0
253+
assert memory_info.limit_ratio == 0.5

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)