Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
599f968
[wip] [core] (cgroups 14/n) Clean up bazel targets and expose just the
israbbani Sep 30, 2025
77f6b64
[core] Cleaning up Cgroup related bazel targets. CgroupManagerInteface
israbbani Oct 6, 2025
9fd1160
[core] (cgroups 14/n) Clean up bazel targets and enable cross-platform
israbbani Oct 6, 2025
a5f4b5a
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
762b5cf
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 6, 2025
8d07f6f
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
b92677e
Forgot to use clang locally
israbbani Oct 7, 2025
a870d5a
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
e34f19b
Unused imports
israbbani Oct 7, 2025
55f1ec8
unused includes breaking the build
israbbani Oct 7, 2025
4c7545e
fixing the macos build
israbbani Oct 7, 2025
aef6bd8
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 7, 2025
bac50d3
[core] (cgroups 15/n) Changing the cgroup heirarchy to have three
israbbani Oct 7, 2025
50b2d14
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 7, 2025
59366ce
move operators for NoopCgroupManager
israbbani Oct 7, 2025
44ab09e
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
0c8d8e3
Update src/ray/common/cgroup2/cgroup_manager_factory.h
israbbani Oct 7, 2025
6dc39ad
feedback
israbbani Oct 7, 2025
60d77bb
up
israbbani Oct 8, 2025
bfd2482
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 9, 2025
59a0bef
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 9, 2025
ee024ea
Different cgroup hierarchy.
israbbani Oct 10, 2025
bf390de
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 10, 2025
028f3d2
Merge branch 'irabbani/cgroups-15' of github.com:ray-project/ray into…
israbbani Oct 10, 2025
cb34c9b
typo
israbbani Oct 10, 2025
fb7d1ac
one more typo
israbbani Oct 10, 2025
8b443f5
one more
israbbani Oct 10, 2025
eddb0b2
Cleaning up docs and log lines
israbbani Oct 11, 2025
4550bae
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 11, 2025
c4884ee
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# These tests are intended to run in CI inside a container.
#
# If you want to run this test locally, you will need to create a cgroup that
# the raylet can manage and delegate to the correct user.
# Ray can manage and delegate to the correct user.
#
# Run these commands locally before running the test suite:
#
Expand Down Expand Up @@ -44,13 +44,13 @@
# / \
# TEST_CGROUP LEAF_CGROUP
# |
# ray_node_<node_id>
# / \
# system application
# | |
# leaf leaf
# ray-node_<node_id>
# | |
# system user
# | | |
# leaf workers non-ray
#
# NOTE: The test suite does not assume that ROOT_CGROUP is an actual root cgroup. Therefore,
# NOTE: The test suite does not assume that ROOT_CGROUP is the OS's root cgroup. Therefore,
# 1. setup will migrate all processes from the ROOT_CGROUP -> LEAF_CGROUP
# 2. teardown will migrate all processes from the LEAF_CGROUP -> ROOT_CGROUP
#
Expand All @@ -67,7 +67,7 @@
_TEST_CGROUP = _BASE_CGROUP / "test"
_LEAF_GROUP = _BASE_CGROUP / "leaf"

_MOUNT_FILE_PATH = "/etc/mtab"
_MOUNT_FILE_PATH = "/proc/mounts"

# The list of processes expected to be started in the system cgroup
# with default params for 'ray start' and 'ray.init(...)'
Expand Down Expand Up @@ -270,55 +270,64 @@ def assert_cgroup_hierarchy_exists_for_node(

The cgroup hierarchy looks like:

_TEST_CGROUP
_TEST_CGROUP
|
ray_node_<node_id>
| |
system application
| |
leaf leaf
ray-node_<node_id>
| |
system user
| | |
leaf workers non-ray

Args:
node_id: used to find the path of the cgroup subtree
resource_isolation_config: used to verify constraints enabled on the system
and application cgroups
resource_isolation_config: used to verify constraints enabled on the system, workers, and user cgroups
"""
base_cgroup_for_node = resource_isolation_config.cgroup_path
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
system_cgroup = node_cgroup / "system"
system_leaf_cgroup = system_cgroup / "leaf"
application_cgroup = node_cgroup / "application"
application_leaf_cgroup = application_cgroup / "leaf"
user_cgroup = node_cgroup / "user"
workers_cgroup = user_cgroup / "workers"
non_ray_cgroup = user_cgroup / "non-ray"

# 1) Check that the cgroup hierarchy is created correctly for the node.
assert node_cgroup.is_dir()
assert system_cgroup.is_dir()
assert system_leaf_cgroup.is_dir()
assert application_cgroup.is_dir()
assert application_leaf_cgroup.is_dir()
assert workers_cgroup.is_dir()
assert user_cgroup.is_dir()
assert non_ray_cgroup.is_dir()

# 2) Verify the constraints are applied correctly.
system_cgroup_memory_min = system_cgroup / "memory.min"
with open(system_cgroup_memory_min, "r") as memory_min_file:
with open(system_cgroup / "memory.min", "r") as memory_min_file:
contents = memory_min_file.read().strip()
assert contents == str(resource_isolation_config.system_reserved_memory)
system_cgroup_cpu_weight = system_cgroup / "cpu.weight"
with open(system_cgroup_cpu_weight, "r") as cpu_weight_file:
with open(system_cgroup / "cpu.weight", "r") as cpu_weight_file:
contents = cpu_weight_file.read().strip()
assert contents == str(resource_isolation_config.system_reserved_cpu_weight)
application_cgroup_cpu_weight = application_cgroup / "cpu.weight"
with open(application_cgroup_cpu_weight, "r") as cpu_weight_file:
with open(user_cgroup / "cpu.weight", "r") as cpu_weight_file:
contents = cpu_weight_file.read().strip()
assert contents == str(
10000 - resource_isolation_config.system_reserved_cpu_weight
)


def assert_system_processes_are_in_system_cgroup(
node_id, resource_isolation_config, expected_count
node_id: str,
resource_isolation_config: ResourceIsolationConfig,
expected_count: int,
):
"""Asserts that the system processes were created in the correct cgroup.

Args:
node_id: used to construct the path of the cgroup subtree
resource_isolation_config: used to construct the path of the cgroup
subtree
expected_count: the number of expected system processes.

"""
base_cgroup_for_node = resource_isolation_config.cgroup_path
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
system_cgroup = node_cgroup / "system"
system_leaf_cgroup = system_cgroup / "leaf"

Expand All @@ -330,26 +339,24 @@ def assert_system_processes_are_in_system_cgroup(
), f"Expected only system process passed into the raylet. Found {lines}"


def assert_worker_processes_are_in_application_cgroup(
def assert_worker_processes_are_in_workers_cgroup(
node_id: str,
resource_isolation_config: ResourceIsolationConfig,
worker_pids: Set[str],
):
"""Asserts that the cgroup hierarchy was deleted correctly for the node.
"""Asserts that the worker processes were created in the correct cgroup.

Args:
node_id: used to construct the path of the cgroup subtree
resource_isolation_config: used to construct the path of the cgroup
subtree
worker_pids: a set of pids that are expected inside the application
worker_pids: a set of pids that are expected inside the workers
leaf cgroup.
"""
base_cgroup_for_node = resource_isolation_config.cgroup_path
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
application_leaf_cgroup_procs = (
node_cgroup / "application" / "leaf" / "cgroup.procs"
)
with open(application_leaf_cgroup_procs, "r") as cgroup_procs_file:
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
workers_cgroup_procs = node_cgroup / "user" / "workers" / "cgroup.procs"
with open(workers_cgroup_procs, "r") as cgroup_procs_file:
pids_in_cgroup = set()
lines = cgroup_procs_file.readlines()
for line in lines:
Expand All @@ -368,7 +375,7 @@ def assert_cgroup_hierarchy_cleaned_up_for_node(
subtree
"""
base_cgroup_for_node = resource_isolation_config.cgroup_path
node_cgroup = Path(base_cgroup_for_node) / f"ray_node_{node_id}"
node_cgroup = Path(base_cgroup_for_node) / f"ray-node_{node_id}"
# If the root cgroup is deleted, there's no need to check anything else.
assert (
not node_cgroup.is_dir()
Expand Down Expand Up @@ -442,7 +449,7 @@ def get_pid(self):
worker_pids = set()
for actor in actor_refs:
worker_pids.add(str(ray.get(actor.get_pid.remote())))
assert_worker_processes_are_in_application_cgroup(
assert_worker_processes_are_in_workers_cgroup(
node_id, resource_isolation_config, worker_pids
)
runner.invoke(scripts.stop)
Expand Down Expand Up @@ -503,7 +510,7 @@ def get_pid(self):
worker_pids = set()
for actor in actor_refs:
worker_pids.add(str(ray.get(actor.get_pid.remote())))
assert_worker_processes_are_in_application_cgroup(
assert_worker_processes_are_in_workers_cgroup(
node_id, resource_isolation_config, worker_pids
)
ray.shutdown()
Expand Down
129 changes: 75 additions & 54 deletions src/ray/common/cgroup2/cgroup_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@

#include <algorithm>
#include <filesystem>
#include <fstream>
#include <memory>
#include <sstream>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <utility>

Expand All @@ -40,10 +43,12 @@ CgroupManager::CgroupManager(std::string base_cgroup,
node_cgroup_ + std::filesystem::path::preferred_separator + kSystemCgroupName;
system_leaf_cgroup_ =
system_cgroup_ + std::filesystem::path::preferred_separator + kLeafCgroupName;
application_cgroup_ =
node_cgroup_ + std::filesystem::path::preferred_separator + kApplicationCgroupName;
application_leaf_cgroup_ =
application_cgroup_ + std::filesystem::path::preferred_separator + kLeafCgroupName;
user_cgroup_ =
node_cgroup_ + std::filesystem::path::preferred_separator + kUserCgroupName;
workers_cgroup_ =
user_cgroup_ + std::filesystem::path::preferred_separator + kWorkersCgroupName;
non_ray_cgroup_ =
user_cgroup_ + std::filesystem::path::preferred_separator + kNonRayCgroupName;
}

CgroupManager::~CgroupManager() {
Expand All @@ -56,17 +61,19 @@ CgroupManager::CgroupManager(CgroupManager &&other)
: node_cgroup_(std::move(other.node_cgroup_)),
system_cgroup_(std::move(other.system_cgroup_)),
system_leaf_cgroup_(std::move(other.system_leaf_cgroup_)),
application_cgroup_(std::move(other.application_cgroup_)),
application_leaf_cgroup_(std::move(other.application_leaf_cgroup_)),
user_cgroup_(std::move(other.user_cgroup_)),
workers_cgroup_(std::move(other.workers_cgroup_)),
non_ray_cgroup_(std::move(other.non_ray_cgroup_)),
cleanup_operations_(std::move(other.cleanup_operations_)),
cgroup_driver_(std::move(other.cgroup_driver_)) {}

CgroupManager &CgroupManager::operator=(CgroupManager &&other) {
node_cgroup_ = std::move(other.node_cgroup_);
system_cgroup_ = std::move(other.system_cgroup_);
system_leaf_cgroup_ = std::move(other.system_leaf_cgroup_);
application_cgroup_ = std::move(other.application_cgroup_);
application_leaf_cgroup_ = std::move(other.application_leaf_cgroup_);
user_cgroup_ = std::move(other.user_cgroup_);
workers_cgroup_ = std::move(other.workers_cgroup_);
non_ray_cgroup_ = std::move(other.non_ray_cgroup_);
cleanup_operations_ = std::move(other.cleanup_operations_);
cgroup_driver_ = std::move(other.cgroup_driver_);
return *this;
Expand Down Expand Up @@ -202,17 +209,16 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
std::string supported_controllers =
absl::StrCat("[", absl::StrJoin(supported_controllers_, ", "), "]");

// The cpu.weight is distributed between the system and application cgroups.
// The application cgroup gets whatever is leftover from the system cgroup.
int64_t application_cgroup_cpu_weight =
cpu_weight_constraint_.Max() - system_reserved_cpu_weight;
int64_t user_cpu_weight = cpu_weight_constraint_.Max() - system_reserved_cpu_weight;

RAY_LOG(INFO) << absl::StrFormat(
"Initializing CgroupManager at base cgroup at '%s'. Ray's cgroup "
"hierarchy will under the node cgroup at '%s'. The %s controllers will be "
"enabled. "
"The system cgroup at '%s' will have constraints [%s=%lld, %s=%lld]. "
"The application cgroup '%s' will have constraints [%s=%lld].",
"hierarchy will under the node cgroup at '%s' with %s controllers enabled. "
"The system cgroup at '%s' will have [memory] controllers enabled with "
"[%s=%lld, %s=%lld] constraints. "
"The user cgroup '%s' will have no controllers enabled with [%s=%lld] "
"constraints. "
"The user cgroup will contain the [%s, %s] cgroups.",
base_cgroup_,
node_cgroup_,
supported_controllers,
Expand All @@ -221,21 +227,23 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
system_reserved_cpu_weight,
memory_min_constraint_.name_,
system_reserved_memory_bytes,
application_cgroup_,
user_cgroup_,
cpu_weight_constraint_.name_,
application_cgroup_cpu_weight);

// Create the cgroup heirarchy:
// base_cgroup_path (e.g. /sys/fs/cgroup)
// |
// ray_node_<node_id>
// | |
// system application
// | |
// leaf leaf
//
// There need to be two cgroups as leaf nodes because of the no
// internal processes constraint.
user_cpu_weight,
workers_cgroup_,
non_ray_cgroup_);

// Create the cgroup hierarchy:
// base_cgroup_path (e.g. /sys/fs/cgroup)
// |
// ray-node_<node_id>
// | |
// system user
// | | |
// leaf workers non-ray

// There need to be leaf cgroups because of the no internal processes
// constraint.
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(node_cgroup_));
RegisterDeleteCgroup(node_cgroup_);

Expand All @@ -245,28 +253,41 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(system_leaf_cgroup_));
RegisterDeleteCgroup(system_leaf_cgroup_);

RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(application_cgroup_));
RegisterDeleteCgroup(application_cgroup_);
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(user_cgroup_));
RegisterDeleteCgroup(user_cgroup_);

RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(application_leaf_cgroup_));
RegisterDeleteCgroup(application_leaf_cgroup_);
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(workers_cgroup_));
RegisterDeleteCgroup(workers_cgroup_);

// Move all processes from the base_cgroup into the system_leaf_cgroup to make sure
RAY_RETURN_NOT_OK(cgroup_driver_->CreateCgroup(non_ray_cgroup_));
RegisterDeleteCgroup(non_ray_cgroup_);

// Move all processes from the base_cgroup into the non-ray cgroup to make sure
// that the no internal process constraint is not violated. This is relevant
// when the base_cgroup is not a root cgroup for the system. This is likely
// the case if Ray is running inside a container.
RAY_RETURN_NOT_OK(cgroup_driver_->MoveAllProcesses(base_cgroup_, system_leaf_cgroup_));
// when the base_cgroup is not the OS's root cgroup. This is the case when
// Ray is running inside a container.
RAY_RETURN_NOT_OK(cgroup_driver_->MoveAllProcesses(base_cgroup_, non_ray_cgroup_));
RegisterMoveAllProcesses(non_ray_cgroup_, base_cgroup_);

// NOTE: Since the raylet does not own the lifecycle of all system processes,
// there's no guarantee that there are no pids in the system leaf cgroup.
// Therefore, pids need to be migrated out of the system cgroup to delete it.
RegisterMoveAllProcesses(system_leaf_cgroup_, base_cgroup_);

for (const auto &ctrl : supported_controllers_) {
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(base_cgroup_, ctrl));
RegisterDisableController(base_cgroup_, ctrl);
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(node_cgroup_, ctrl));
RegisterDisableController(node_cgroup_, ctrl);
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(system_cgroup_, ctrl));
RegisterDisableController(system_cgroup_, ctrl);
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(application_cgroup_, ctrl));
RegisterDisableController(application_cgroup_, ctrl);
std::array<const std::string *, 2> cpu_controlled_cgroups{&base_cgroup_, &node_cgroup_};
std::array<const std::string *, 3> memory_controlled_cgroups{
&base_cgroup_, &node_cgroup_, &system_cgroup_};

for (const std::string *cpu_controlled_cgroup : cpu_controlled_cgroups) {
RAY_RETURN_NOT_OK(cgroup_driver_->EnableController(*cpu_controlled_cgroup, "cpu"));
RegisterDisableController(*cpu_controlled_cgroup, "cpu");
}

for (const std::string *memory_controlled_cgroup : memory_controlled_cgroups) {
RAY_RETURN_NOT_OK(
cgroup_driver_->EnableController(*memory_controlled_cgroup, "memory"));
RegisterDisableController(*memory_controlled_cgroup, "memory");
}

RAY_RETURN_NOT_OK(
Expand All @@ -283,12 +304,12 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
std::to_string(system_reserved_memory_bytes)));
RegisterRemoveConstraint(system_cgroup_, memory_min_constraint_);

RAY_RETURN_NOT_OK(
cgroup_driver_->AddConstraint(application_cgroup_,
cpu_weight_constraint_.controller_,
cpu_weight_constraint_.name_,
std::to_string(application_cgroup_cpu_weight)));
RegisterRemoveConstraint(application_cgroup_, cpu_weight_constraint_);
RAY_RETURN_NOT_OK(cgroup_driver_->AddConstraint(user_cgroup_,
cpu_weight_constraint_.controller_,
cpu_weight_constraint_.name_,
std::to_string(user_cpu_weight)));
RegisterRemoveConstraint(user_cgroup_, cpu_weight_constraint_);

return Status::OK();
}

Expand All @@ -308,8 +329,8 @@ Status CgroupManager::AddProcessToCgroup(const std::string &cgroup,
return s;
}

Status CgroupManager::AddProcessToApplicationCgroup(const std::string &pid) {
return AddProcessToCgroup(application_leaf_cgroup_, pid);
Status CgroupManager::AddProcessToWorkersCgroup(const std::string &pid) {
return AddProcessToCgroup(workers_cgroup_, pid);
}

Status CgroupManager::AddProcessToSystemCgroup(const std::string &pid) {
Expand Down
Loading