Skip to content

Commit 21a52e5

Browse files
israbbaniedoakes
authored andcommitted
[core] (cgroups 15/n) Adding a user cgroup subtree for non-ray processes. (#57269)
This PR stacks on #57244. For more details about the resource isolation project see #54703. In the previous ray cgroup hierarchy, all processes that were in the path `--cgroup-path` were moved into the system cgroup. This changes the hierarchy to now have a separate cgroup for all non-ray processes. The new cgroup hierarchy looks like ``` cgroup_path (e.g. /sys/fs/cgroup) | ray-node_<node_id> | | system user | | | leaf workers non-ray ``` The cgroups contain the following processes * system/leaf (all ray non-worker processes e.g. raylet, runtime_env_agent, gcs_server, ...) * user/workers (all ray worker processes) * user/non-ray (all non-ray processes migrated from cgroup_path). Note: If you're running ray inside a container, all non-ray processes running in the container will be migrated to `user/non-ray` The following controllers will be enabled * cgroup_path (cpu, memory) * ray-node_<node_id> (cpu, memory) * system (memory) The following constraints are applied * system (cpu.weight, memory.min) * user (cpu.weight) --------- Signed-off-by: Ibrahim Rabbani <irabbani@anyscale.com> Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com> Signed-off-by: elliot-barn <elliot.barnwell@anyscale.com>
1 parent 18593ab commit 21a52e5

File tree

9 files changed

+302
-238
lines changed

9 files changed

+302
-238
lines changed

python/ray/tests/resource_isolation/test_resource_isolation_integration.py

Lines changed: 47 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# These tests are intended to run in CI inside a container.
1717
#
1818
# If you want to run this test locally, you will need to create a cgroup that
19-
# the raylet can manage and delegate to the correct user.
19+
# Ray can manage and delegate to the correct user.
2020
#
2121
# Run these commands locally before running the test suite:
2222
#
@@ -44,13 +44,13 @@
4444
# / \
4545
# TEST_CGROUP LEAF_CGROUP
4646
# |
47-
# ray_node_<node_id>
48-
# / \
49-
# system application
50-
# | |
51-
# leaf leaf
47+
# ray-node_<node_id>
48+
# | |
49+
# system user
50+
# | | |
51+
# leaf workers non-ray
5252
#
53-
# NOTE: The test suite does not assume that ROOT_CGROUP is an actual root cgroup. Therefore,
53+
# NOTE: The test suite does not assume that ROOT_CGROUP is the OS's root cgroup. Therefore,
5454
# 1. setup will migrate all processes from the ROOT_CGROUP -> LEAF_CGROUP
5555
# 2. teardown will migrate all processes from the LEAF_CGROUP -> ROOT_CGROUP
5656
#
@@ -67,7 +67,7 @@
6767
_TEST_CGROUP = _BASE_CGROUP / "test"
6868
_LEAF_GROUP = _BASE_CGROUP / "leaf"
6969

70-
_MOUNT_FILE_PATH = "/etc/mtab"
70+
_MOUNT_FILE_PATH = "/proc/mounts"
7171

7272
# The list of processes expected to be started in the system cgroup
7373
# with default params for 'ray start' and 'ray.init(...)'
@@ -270,55 +270,64 @@ def assert_cgroup_hierarchy_exists_for_node(
270270
271271
The cgroup hierarchy looks like:
272272
273-
_TEST_CGROUP
273+
_TEST_CGROUP
274274
|
275-
ray_node_<node_id>
276-
| |
277-
system application
278-
| |
279-
leaf leaf
275+
ray-node_<node_id>
276+
| |
277+
system user
278+
| | |
279+
leaf workers non-ray
280280
281281
Args:
282282
node_id: used to find the path of the cgroup subtree
283-
resource_isolation_config: used to verify constraints enabled on the system
284-
and application cgroups
283+
resource_isolation_config: used to verify constraints enabled on the system, workers, and user cgroups
285284
"""
286285
base_cgroup_for_node = resource_isolation_config.cgroup_path
287-
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
286+
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
288287
system_cgroup = node_cgroup / "system"
289288
system_leaf_cgroup = system_cgroup / "leaf"
290-
application_cgroup = node_cgroup / "application"
291-
application_leaf_cgroup = application_cgroup / "leaf"
289+
user_cgroup = node_cgroup / "user"
290+
workers_cgroup = user_cgroup / "workers"
291+
non_ray_cgroup = user_cgroup / "non-ray"
292292

293293
# 1) Check that the cgroup hierarchy is created correctly for the node.
294294
assert node_cgroup.is_dir()
295295
assert system_cgroup.is_dir()
296296
assert system_leaf_cgroup.is_dir()
297-
assert application_cgroup.is_dir()
298-
assert application_leaf_cgroup.is_dir()
297+
assert workers_cgroup.is_dir()
298+
assert user_cgroup.is_dir()
299+
assert non_ray_cgroup.is_dir()
299300

300301
# 2) Verify the constraints are applied correctly.
301-
system_cgroup_memory_min = system_cgroup / "memory.min"
302-
with open(system_cgroup_memory_min, "r") as memory_min_file:
302+
with open(system_cgroup / "memory.min", "r") as memory_min_file:
303303
contents = memory_min_file.read().strip()
304304
assert contents == str(resource_isolation_config.system_reserved_memory)
305-
system_cgroup_cpu_weight = system_cgroup / "cpu.weight"
306-
with open(system_cgroup_cpu_weight, "r") as cpu_weight_file:
305+
with open(system_cgroup / "cpu.weight", "r") as cpu_weight_file:
307306
contents = cpu_weight_file.read().strip()
308307
assert contents == str(resource_isolation_config.system_reserved_cpu_weight)
309-
application_cgroup_cpu_weight = application_cgroup / "cpu.weight"
310-
with open(application_cgroup_cpu_weight, "r") as cpu_weight_file:
308+
with open(user_cgroup / "cpu.weight", "r") as cpu_weight_file:
311309
contents = cpu_weight_file.read().strip()
312310
assert contents == str(
313311
10000 - resource_isolation_config.system_reserved_cpu_weight
314312
)
315313

316314

317315
def assert_system_processes_are_in_system_cgroup(
318-
node_id, resource_isolation_config, expected_count
316+
node_id: str,
317+
resource_isolation_config: ResourceIsolationConfig,
318+
expected_count: int,
319319
):
320+
"""Asserts that the system processes were created in the correct cgroup.
321+
322+
Args:
323+
node_id: used to construct the path of the cgroup subtree
324+
resource_isolation_config: used to construct the path of the cgroup
325+
subtree
326+
expected_count: the number of expected system processes.
327+
328+
"""
320329
base_cgroup_for_node = resource_isolation_config.cgroup_path
321-
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
330+
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
322331
system_cgroup = node_cgroup / "system"
323332
system_leaf_cgroup = system_cgroup / "leaf"
324333

@@ -330,26 +339,24 @@ def assert_system_processes_are_in_system_cgroup(
330339
), f"Expected only system process passed into the raylet. Found {lines}"
331340

332341

333-
def assert_worker_processes_are_in_application_cgroup(
342+
def assert_worker_processes_are_in_workers_cgroup(
334343
node_id: str,
335344
resource_isolation_config: ResourceIsolationConfig,
336345
worker_pids: Set[str],
337346
):
338-
"""Asserts that the cgroup hierarchy was deleted correctly for the node.
347+
"""Asserts that the worker processes were created in the correct cgroup.
339348
340349
Args:
341350
node_id: used to construct the path of the cgroup subtree
342351
resource_isolation_config: used to construct the path of the cgroup
343352
subtree
344-
worker_pids: a set of pids that are expected inside the application
353+
worker_pids: a set of pids that are expected inside the workers
345354
leaf cgroup.
346355
"""
347356
base_cgroup_for_node = resource_isolation_config.cgroup_path
348-
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
349-
application_leaf_cgroup_procs = (
350-
node_cgroup / "application" / "leaf" / "cgroup.procs"
351-
)
352-
with open(application_leaf_cgroup_procs, "r") as cgroup_procs_file:
357+
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
358+
workers_cgroup_procs = node_cgroup / "user" / "workers" / "cgroup.procs"
359+
with open(workers_cgroup_procs, "r") as cgroup_procs_file:
353360
pids_in_cgroup = set()
354361
lines = cgroup_procs_file.readlines()
355362
for line in lines:
@@ -368,7 +375,7 @@ def assert_cgroup_hierarchy_cleaned_up_for_node(
368375
subtree
369376
"""
370377
base_cgroup_for_node = resource_isolation_config.cgroup_path
371-
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
378+
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
372379
# If the root cgroup is deleted, there's no need to check anything else.
373380
assert (
374381
not node_cgroup.is_dir()
@@ -442,7 +449,7 @@ def get_pid(self):
442449
worker_pids = set()
443450
for actor in actor_refs:
444451
worker_pids.add(str(ray.get(actor.get_pid.remote())))
445-
assert_worker_processes_are_in_application_cgroup(
452+
assert_worker_processes_are_in_workers_cgroup(
446453
node_id, resource_isolation_config, worker_pids
447454
)
448455
runner.invoke(scripts.stop)
@@ -503,7 +510,7 @@ def get_pid(self):
503510
worker_pids = set()
504511
for actor in actor_refs:
505512
worker_pids.add(str(ray.get(actor.get_pid.remote())))
506-
assert_worker_processes_are_in_application_cgroup(
513+
assert_worker_processes_are_in_workers_cgroup(
507514
node_id, resource_isolation_config, worker_pids
508515
)
509516
ray.shutdown()

src/ray/common/cgroup2/cgroup_manager.cc

Lines changed: 75 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616

1717
#include <algorithm>
1818
#include <filesystem>
19+
#include <fstream>
1920
#include <memory>
21+
#include <sstream>
2022
#include <string>
23+
#include <type_traits>
2124
#include <unordered_set>
2225
#include <utility>
2326

@@ -40,10 +43,12 @@ CgroupManager::CgroupManager(std::string base_cgroup,
4043
node_cgroup_ + std::filesystem::path::preferred_separator + kSystemCgroupName;
4144
system_leaf_cgroup_ =
4245
system_cgroup_ + std::filesystem::path::preferred_separator + kLeafCgroupName;
43-
application_cgroup_ =
44-
node_cgroup_ + std::filesystem::path::preferred_separator + kApplicationCgroupName;
45-
application_leaf_cgroup_ =
46-
application_cgroup_ + std::filesystem::path::preferred_separator + kLeafCgroupName;
46+
user_cgroup_ =
47+
node_cgroup_ + std::filesystem::path::preferred_separator + kUserCgroupName;
48+
workers_cgroup_ =
49+
user_cgroup_ + std::filesystem::path::preferred_separator + kWorkersCgroupName;
50+
non_ray_cgroup_ =
51+
user_cgroup_ + std::filesystem::path::preferred_separator + kNonRayCgroupName;
4752
}
4853

4954
CgroupManager::~CgroupManager() {
@@ -56,17 +61,19 @@ CgroupManager::CgroupManager(CgroupManager &&other)
5661
: node_cgroup_(std::move(other.node_cgroup_)),
5762
system_cgroup_(std::move(other.system_cgroup_)),
5863
system_leaf_cgroup_(std::move(other.system_leaf_cgroup_)),
59-
application_cgroup_(std::move(other.application_cgroup_)),
60-
application_leaf_cgroup_(std::move(other.application_leaf_cgroup_)),
64+
user_cgroup_(std::move(other.user_cgroup_)),
65+
workers_cgroup_(std::move(other.workers_cgroup_)),
66+
non_ray_cgroup_(std::move(other.non_ray_cgroup_)),
6167
cleanup_operations_(std::move(other.cleanup_operations_)),
6268
cgroup_driver_(std::move(other.cgroup_driver_)) {}
6369

6470
CgroupManager &CgroupManager::operator=(CgroupManager &&other) {
6571
node_cgroup_ = std::move(other.node_cgroup_);
6672
system_cgroup_ = std::move(other.system_cgroup_);
6773
system_leaf_cgroup_ = std::move(other.system_leaf_cgroup_);
68-
application_cgroup_ = std::move(other.application_cgroup_);
69-
application_leaf_cgroup_ = std::move(other.application_leaf_cgroup_);
74+
user_cgroup_ = std::move(other.user_cgroup_);
75+
workers_cgroup_ = std::move(other.workers_cgroup_);
76+
non_ray_cgroup_ = std::move(other.non_ray_cgroup_);
7077
cleanup_operations_ = std::move(other.cleanup_operations_);
7178
cgroup_driver_ = std::move(other.cgroup_driver_);
7279
return *this;
@@ -202,17 +209,16 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
202209
std::string supported_controllers =
203210
absl::StrCat("[", absl::StrJoin(supported_controllers_, ", "), "]");
204211

205-
// The cpu.weight is distributed between the system and application cgroups.
206-
// The application cgroup gets whatever is leftover from the system cgroup.
207-
int64_t application_cgroup_cpu_weight =
208-
cpu_weight_constraint_.Max() - system_reserved_cpu_weight;
212+
int64_t user_cpu_weight = cpu_weight_constraint_.Max() - system_reserved_cpu_weight;
209213

210214
RAY_LOG(INFO) << absl::StrFormat(
211215
"Initializing CgroupManager at base cgroup at '%s'. Ray's cgroup "
212-
"hierarchy will under the node cgroup at '%s'. The %s controllers will be "
213-
"enabled. "
214-
"The system cgroup at '%s' will have constraints [%s=%lld, %s=%lld]. "
215-
"The application cgroup '%s' will have constraints [%s=%lld].",
216+
"hierarchy will under the node cgroup at '%s' with %s controllers enabled. "
217+
"The system cgroup at '%s' will have [memory] controllers enabled with "
218+
"[%s=%lld, %s=%lld] constraints. "
219+
"The user cgroup '%s' will have no controllers enabled with [%s=%lld] "
220+
"constraints. "
221+
"The user cgroup will contain the [%s, %s] cgroups.",
216222
base_cgroup_,
217223
node_cgroup_,
218224
supported_controllers,
@@ -221,21 +227,23 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
221227
system_reserved_cpu_weight,
222228
memory_min_constraint_.name_,
223229
system_reserved_memory_bytes,
224-
application_cgroup_,
230+
user_cgroup_,
225231
cpu_weight_constraint_.name_,
226-
application_cgroup_cpu_weight);
227-
228-
// Create the cgroup heirarchy:
229-
// base_cgroup_path (e.g. /sys/fs/cgroup)
230-
// |
231-
// ray_node_<node_id>
232-
// | |
233-
// system application
234-
// | |
235-
// leaf leaf
236-
//
237-
// There need to be two cgroups as leaf nodes because of the no
238-
// internal processes constraint.
232+
user_cpu_weight,
233+
workers_cgroup_,
234+
non_ray_cgroup_);
235+
236+
// Create the cgroup hierarchy:
237+
// base_cgroup_path (e.g. /sys/fs/cgroup)
238+
// |
239+
// ray-node_<node_id>
240+
// | |
241+
// system user
242+
// | | |
243+
// leaf workers non-ray
244+
245+
// There need to be leaf cgroups because of the no internal processes
246+
// constraint.
239247
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(node_cgroup_));
240248
RegisterDeleteCgroup(node_cgroup_);
241249

@@ -245,28 +253,41 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
245253
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(system_leaf_cgroup_));
246254
RegisterDeleteCgroup(system_leaf_cgroup_);
247255

248-
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(application_cgroup_));
249-
RegisterDeleteCgroup(application_cgroup_);
256+
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(user_cgroup_));
257+
RegisterDeleteCgroup(user_cgroup_);
250258

251-
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(application_leaf_cgroup_));
252-
RegisterDeleteCgroup(application_leaf_cgroup_);
259+
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(workers_cgroup_));
260+
RegisterDeleteCgroup(workers_cgroup_);
253261

254262
// Move all processes from the base_cgroup into the system_leaf_cgroup to make sure
263+
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(non_ray_cgroup_));
264+
RegisterDeleteCgroup(non_ray_cgroup_);
265+
266+
// Move all processes from the base_cgroup into the non-ray cgroup to make sure
255267
// that the no internal process constraint is not violated. This is relevant
256-
// when the base_cgroup is not a root cgroup for the system. This is likely
257-
// the case if Ray is running inside a container.
258-
RAY_RETURN_NOT_OK(cgroup_driver_->MoveAllProcesses(base_cgroup_, system_leaf_cgroup_));
268+
// when the base_cgroup is not the OS's root cgroup. This is the case when
269+
// Ray is running inside a container.
270+
RAY_RETURN_NOT_OK(cgroup_driver_->MoveAllProcesses(base_cgroup_, non_ray_cgroup_));
271+
RegisterMoveAllProcesses(non_ray_cgroup_, base_cgroup_);
272+
273+
// NOTE: Since the raylet does not own the lifecycle of all system processes,
274+
// there's no guarantee that there are no pids in the system leaf cgroup.
275+
// Therefore, pids need to be migrated out of the system cgroup to delete it.
259276
RegisterMoveAllProcesses(system_leaf_cgroup_, base_cgroup_);
260277

261-
for (const auto &ctrl : supported_controllers_) {
262-
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(base_cgroup_, ctrl));
263-
RegisterDisableController(base_cgroup_, ctrl);
264-
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(node_cgroup_, ctrl));
265-
RegisterDisableController(node_cgroup_, ctrl);
266-
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(system_cgroup_, ctrl));
267-
RegisterDisableController(system_cgroup_, ctrl);
268-
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(application_cgroup_, ctrl));
269-
RegisterDisableController(application_cgroup_, ctrl);
278+
std::array<const std::string *, 2> cpu_controlled_cgroups{&base_cgroup_, &node_cgroup_};
279+
std::array<const std::string *, 3> memory_controlled_cgroups{
280+
&base_cgroup_, &node_cgroup_, &system_cgroup_};
281+
282+
for (const std::string *cpu_controlled_cgroup : cpu_controlled_cgroups) {
283+
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(*cpu_controlled_cgroup, "cpu"));
284+
RegisterDisableController(*cpu_controlled_cgroup, "cpu");
285+
}
286+
287+
for (const std::string *memory_controlled_cgroup : memory_controlled_cgroups) {
288+
RAY_RETURN_NOT_OK(
289+
cgroup_driver_->EnableController(*memory_controlled_cgroup, "memory"));
290+
RegisterDisableController(*memory_controlled_cgroup, "memory");
270291
}
271292

272293
RAY_RETURN_NOT_OK(
@@ -283,12 +304,12 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
283304
std::to_string(system_reserved_memory_bytes)));
284305
RegisterRemoveConstraint(system_cgroup_, memory_min_constraint_);
285306

286-
RAY_RETURN_NOT_OK(
287-
cgroup_driver_->AddConstraint(application_cgroup_,
288-
cpu_weight_constraint_.controller_,
289-
cpu_weight_constraint_.name_,
290-
std::to_string(application_cgroup_cpu_weight)));
291-
RegisterRemoveConstraint(application_cgroup_, cpu_weight_constraint_);
307+
RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(user_cgroup_,
308+
cpu_weight_constraint_.controller_,
309+
cpu_weight_constraint_.name_,
310+
std::to_string(user_cpu_weight)));
311+
RegisterRemoveConstraint(user_cgroup_, cpu_weight_constraint_);
312+
292313
return Status::OK();
293314
}
294315

@@ -308,8 +329,8 @@ Status CgroupManager::AddProcessToCgroup(const std::string &cgroup,
308329
return s;
309330
}
310331

311-
Status CgroupManager::AddProcessToApplicationCgroup(const std::string &pid) {
312-
return AddProcessToCgroup(application_leaf_cgroup_, pid);
332+
Status CgroupManager::AddProcessToWorkersCgroup(const std::string &pid) {
333+
return AddProcessToCgroup(workers_cgroup_, pid);
313334
}
314335

315336
Status CgroupManager::AddProcessToSystemCgroup(const std::string &pid) {

0 commit comments

Comments
 (0)