
Commit b467908

[core] (cgroups 20/n) Move dashboard api server's subprocesses into system cgroup (#57864)
For more details about the resource isolation project, see #54703. When starting the head node, move the dashboard api server's subprocesses into the system cgroup. I updated the integration test and added a helpful error message, because the test will break in the future when a new dashboard module is added.

I ran the integration tests 25 times locally:

> (ray2) ubuntu@devbox:~/code/ray2$ python -m pytest -s python/ray/tests/resource_isolation/test_resource_isolation_integration.py --count 25 -x
> ...
> collecting ...
> python/ray/tests/resource_isolation/test_resource_isolation_integration.py ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 25% ██▌
> 2025-10-17 23:13:51,897 INFO worker.py:1833 -- Connecting to existing Ray cluster at address: 172.31.12.251:6379...
> 2025-10-17 23:13:51,905 INFO worker.py:2004 -- Connected to Ray cluster. View the dashboard at http://127.0.0.1:8265
> python/ray/tests/resource_isolation/test_resource_isolation_integration.py ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 26% ██▋
> 2025-10-17 23:13:57,592 INFO worker.py:1833 -- Connecting to existing Ray cluster at address: 172.31.12.251:6379...
> 2025-10-17 23:13:57,598 INFO worker.py:2004 -- Connected to Ray cluster. View the dashboard at http://127.0.0.1:8265
> python/ray/tests/resource_isolation/test_resource_isolation_integration.py ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 98% █████████▊
> 2025-10-17 23:19:45,417 INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
> python/ray/tests/resource_isolation/test_resource_isolation_integration.py ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 99% █████████▉
> 2025-10-17 23:19:50,194 INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
> python/ray/tests/resource_isolation/test_resource_isolation_integration.py ✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓ 100% ██████████
> Results (366.41s): 100 passed

---------

Signed-off-by: irabbani <israbbani@gmail.com>
1 parent b988ce4 commit b467908
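
For context, the mechanism behind "moving a process into a cgroup" on cgroup v2 is writing its PID into the target cgroup's cgroup.procs file. A minimal illustrative sketch of that mechanism, not Ray's implementation (the directory path and helper name below are made up for the example):

# Illustrative only: on cgroup v2, a process is moved into a cgroup by
# writing its PID into that cgroup's cgroup.procs file. Requires write
# permission on the target cgroup. The path is a hypothetical example.
def move_pid_into_cgroup(pid: int, cgroup_dir: str) -> None:
    with open(f"{cgroup_dir}/cgroup.procs", "w") as f:
        f.write(str(pid))

# e.g. move_pid_into_cgroup(1234, "/sys/fs/cgroup/example/system")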

File tree

3 files changed: 40 additions & 5 deletions


python/ray/_private/node.py

Lines changed: 16 additions & 1 deletion
@@ -36,6 +36,8 @@
 )
 from ray._raylet import GcsClient, get_session_key_from_storage

+import psutil
+
 # Logger for this module. It should be configured at the entry point
 # into the program using Ray. Ray configures it by default automatically
 # using logging.basicConfig in its entry/init points.
@@ -1438,7 +1440,20 @@ def _get_system_processes_for_resource_isolation(self) -> str:
         added to self.all_processes so it can be moved into the raylet's managed cgroup
         hierarchy.
         """
-        return ",".join(str(p[0].process.pid) for p in self.all_processes.values())
+        system_process_pids = [
+            str(p[0].process.pid) for p in self.all_processes.values()
+        ]
+
+        # If the dashboard api server was started on the head node, then include all of the api server's
+        # child processes.
+        if ray_constants.PROCESS_TYPE_DASHBOARD in self.all_processes:
+            dashboard_pid = self.all_processes[ray_constants.PROCESS_TYPE_DASHBOARD][
+                0
+            ].process.pid
+            dashboard_process = psutil.Process(dashboard_pid)
+            system_process_pids += [str(p.pid) for p in dashboard_process.children()]
+
+        return ",".join(system_process_pids)

     def _kill_process_type(
         self,
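
The new code relies on psutil to enumerate the dashboard api server's direct children at the moment the raylet is launched; children() is a point-in-time snapshot, so subprocesses forked later are not captured. A standalone sketch of the same pattern (the helper name and the spawned command are illustrative, not part of Ray):

import subprocess
import time

import psutil


def child_pids_of(pid: int) -> list[str]:
    # psutil.Process(pid).children() returns direct (non-recursive) children,
    # matching how node.py collects the api server's subprocesses.
    return [str(p.pid) for p in psutil.Process(pid).children()]


# Spawn a shell that forks one sleeping child, then enumerate the child.
proc = subprocess.Popen(["bash", "-c", "sleep 30 & wait"])
time.sleep(0.5)  # give the shell a moment to fork its child
print(",".join(child_pids_of(proc.pid)))  # comma-separated, like the string node.py builds
proc.kill()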

python/ray/tests/resource_isolation/test_resource_isolation_integration.py

Lines changed: 23 additions & 3 deletions
@@ -69,6 +69,22 @@

 _MOUNT_FILE_PATH = "/proc/mounts"

+# The names are here to help debug test failures. Tests should
+# only use the size of this list. These processes are expected to be moved
+# into the system cgroup.
+_EXPECTED_DASHBOARD_MODULES = [
+    "ray.dashboard.modules.usage_stats.usage_stats_head.UsageStatsHead",
+    "ray.dashboard.modules.metrics.metrics_head.MetricsHead",
+    "ray.dashboard.modules.data.data_head.DataHead",
+    "ray.dashboard.modules.event.event_head.EventHead",
+    "ray.dashboard.modules.job.job_head.JobHead",
+    "ray.dashboard.modules.node.node_head.NodeHead",
+    "ray.dashboard.modules.reporter.reporter_head.ReportHead",
+    "ray.dashboard.modules.serve.serve_head.ServeHead",
+    "ray.dashboard.modules.state.state_head.StateHead",
+    "ray.dashboard.modules.train.train_head.TrainHead",
+]
+
 # The list of processes expected to be started in the system cgroup
 # with default params for 'ray start' and 'ray.init(...)'
 _EXPECTED_SYSTEM_PROCESSES_RAY_START = [
@@ -345,7 +361,7 @@ def assert_system_processes_are_in_system_cgroup(
     lines = cgroup_procs_file.readlines()
     assert (
         len(lines) == expected_count
-    ), f"Expected only system process passed into the raylet. Found {lines}"
+    ), f"Expected only system process passed into the raylet. Found {lines}. You may have added a new dashboard module, in which case you need to update _EXPECTED_DASHBOARD_MODULES."


 def assert_worker_processes_are_in_workers_cgroup(
@@ -457,7 +473,9 @@ def get_pid(self):
     for actor in actor_refs:
        worker_pids.add(str(ray.get(actor.get_pid.remote())))
    assert_system_processes_are_in_system_cgroup(
-        node_id, resource_isolation_config, len(_EXPECTED_SYSTEM_PROCESSES_RAY_START)
+        node_id,
+        resource_isolation_config,
+        len(_EXPECTED_SYSTEM_PROCESSES_RAY_START) + len(_EXPECTED_DASHBOARD_MODULES),
     )
     assert_worker_processes_are_in_workers_cgroup(
         node_id, resource_isolation_config, worker_pids
@@ -520,7 +538,9 @@ def get_pid(self):
     for actor in actor_refs:
         worker_pids.add(str(ray.get(actor.get_pid.remote())))
     assert_system_processes_are_in_system_cgroup(
-        node_id, resource_isolation_config, len(_EXPECTED_SYSTEM_PROCESSES_RAY_INIT)
+        node_id,
+        resource_isolation_config,
+        len(_EXPECTED_SYSTEM_PROCESSES_RAY_INIT) + len(_EXPECTED_DASHBOARD_MODULES),
     )
     assert_worker_processes_are_in_workers_cgroup(
         node_id, resource_isolation_config, worker_pids
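
The assertion works by reading the system cgroup's cgroup.procs file, which lists one PID per line, and comparing the line count against the expected number of system processes. A simplified sketch of that check (the path is hypothetical; the real test derives it from the node id and the resource isolation config):

def count_cgroup_procs(cgroup_procs_path: str) -> int:
    # cgroup.procs lists one PID per line for every process in the cgroup.
    with open(cgroup_procs_path) as f:
        return len(f.readlines())


# Hypothetical usage; path and expected count are assumptions for illustration.
# expected = len(_EXPECTED_SYSTEM_PROCESSES_RAY_START) + len(_EXPECTED_DASHBOARD_MODULES)
# assert count_cgroup_procs("/sys/fs/cgroup/example/system/cgroup.procs") == expected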

src/ray/common/cgroup2/cgroup_manager.cc

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ StatusOr<std::unique_ptr<CgroupManager>> CgroupManager::Create(
     std::unique_ptr<CgroupDriverInterface> cgroup_driver) {
   if (!cpu_weight_constraint_.IsValid(system_reserved_cpu_weight)) {
     return Status::InvalidArgument(
-        absl::StrFormat("Invalid constraint %s=%d. %s must be in the range [%d, %d].",
+        absl::StrFormat(" Invalid constraint %s=%d. %s must be in the range [%d, %d].",
                         cpu_weight_constraint_.name_,
                         system_reserved_cpu_weight,
                         cpu_weight_constraint_.name_,
