Skip to content

Commit 80cf67e

Browse files
israbbani and edoakes
authored and committed
[core] (cgroups 14/n) Clean up bazel targets and support cross-platform build. (#57244)
For more details about the resource isolation project see #54703. This PR introduces two public bazel targets from the `//src/ray/common/cgroup2` subsystem. * `CgroupManagerFactory` is a cross-platform target that exports a working CgroupManager on Linux if resource isolation is enabled. It exports a Noop implementation if running on a non-Linux platform or if resource isolation is not enabled on Linux. * `CgroupManagerInterface` is the public API of CgroupManager. It also introduces a few other changes 1. All resource isolation related configuration parsing and input validation has been moved into CgroupManagerFactory. 2. NodeManager now controls the lifecycle (and destruction) of CgroupManager. 3. SysFsCgroupDriver uses a linux header file to find the path of the mount file instead of hardcoding because different linux distributions can use different files. --------- Signed-off-by: Ibrahim Rabbani <israbbani@gmail.com> Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
1 parent 2156393 commit 80cf67e

17 files changed

+327
-327
lines changed

src/ray/common/cgroup2/BUILD.bazel

Lines changed: 51 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,104 +5,110 @@ config_setting(
55
constraint_values = ["@platforms//os:linux"],
66
)
77

8-
# Public targets.
8+
# The module exposes only two public targets.
9+
# "cgroup_manager_factory" to create a CgroupManager
10+
# "cgroup_manager_interface" to use the public API of CgroupManager.
911
ray_cc_library(
10-
name = "cgroup_manager",
12+
name = "cgroup_manager_factory",
1113
srcs = select({
12-
":is_linux": ["cgroup_manager.cc"],
13-
"//conditions:default": ["noop_cgroup_manager.cc"],
14+
":is_linux": [
15+
"linux_cgroup_manager_factory.cc",
16+
],
17+
"//conditions:default": [
18+
"noop_cgroup_manager_factory.cc",
19+
],
1420
}),
1521
hdrs = [
16-
"cgroup_manager.h",
17-
"scoped_cgroup_operation.h",
22+
"cgroup_manager_factory.h",
1823
],
1924
visibility = ["//visibility:public"],
2025
deps = [
21-
":cgroup_driver_interface",
2226
":cgroup_manager_interface",
23-
"//src/ray/common:status",
24-
"//src/ray/common:status_or",
27+
":noop_cgroup_manager",
28+
"//src/ray/util:logging",
2529
] + select({
2630
":is_linux": [
27-
"//src/ray/util:logging",
31+
":cgroup_driver_interface",
32+
":cgroup_manager",
33+
":sysfs_cgroup_driver",
34+
"//src/ray/common:status",
35+
"//src/ray/common:status_or",
2836
"@com_google_absl//absl/strings",
2937
],
3038
"//conditions:default": [],
3139
}),
3240
)
3341

3442
ray_cc_library(
35-
name = "cgroup_driver_interface",
43+
name = "cgroup_manager_interface",
3644
hdrs = [
37-
"cgroup_driver_interface.h",
45+
"cgroup_manager_interface.h",
3846
],
3947
visibility = ["//visibility:public"],
4048
deps = [
49+
":cgroup_driver_interface",
4150
"//src/ray/common:status",
4251
"//src/ray/common:status_or",
4352
],
4453
)
4554

55+
# Private targets
4656
ray_cc_library(
47-
name = "cgroup_manager_interface",
57+
name = "cgroup_manager",
58+
srcs = [
59+
"cgroup_manager.cc",
60+
],
4861
hdrs = [
49-
"cgroup_manager_interface.h",
62+
"cgroup_manager.h",
63+
"scoped_cgroup_operation.h",
5064
],
51-
visibility = ["//visibility:public"],
65+
visibility = [":__subpackages__"],
5266
deps = [
5367
":cgroup_driver_interface",
68+
":cgroup_manager_interface",
5469
"//src/ray/common:status",
5570
"//src/ray/common:status_or",
71+
"//src/ray/util:logging",
5672
],
5773
)
5874

5975
ray_cc_library(
60-
name = "sysfs_cgroup_driver",
61-
srcs = select({
62-
":is_linux": ["sysfs_cgroup_driver.cc"],
63-
"//conditions:default": ["noop_sysfs_cgroup_driver.cc"],
64-
}),
76+
name = "noop_cgroup_manager",
6577
hdrs = [
66-
"sysfs_cgroup_driver.h",
78+
"noop_cgroup_manager.h",
6779
],
68-
visibility = ["//visibility:public"],
80+
visibility = [":__subpackages__"],
6981
deps = [
7082
":cgroup_driver_interface",
83+
":cgroup_manager_interface",
7184
"//src/ray/common:status",
7285
"//src/ray/common:status_or",
73-
] + select({
74-
":is_linux": [
75-
"//src/ray/util:logging",
76-
"@com_google_absl//absl/strings",
77-
],
78-
"//conditions:default": [],
79-
}),
86+
],
8087
)
8188

82-
# Private targets
83-
#
84-
# TODO(#54703): This target builds the noop implementations.
85-
# There's a corressponding test that runs on Linux and Non-Linux
86-
# CI so breakages are caught in premerge before these targets are
87-
# cleaned up at the end of the resource isolation milestone 1
88-
# project.
8989
ray_cc_library(
90-
name = "noop_cgroup_targets",
91-
srcs = [
92-
"noop_cgroup_manager.cc",
93-
"noop_sysfs_cgroup_driver.cc",
94-
],
90+
name = "cgroup_driver_interface",
9591
hdrs = [
96-
"cgroup_manager.h",
97-
"scoped_cgroup_operation.h",
98-
"sysfs_cgroup_driver.h",
92+
"cgroup_driver_interface.h",
9993
],
10094
visibility = [":__subpackages__"],
95+
deps = [
96+
"//src/ray/common:status",
97+
"//src/ray/common:status_or",
98+
],
99+
)
100+
101+
ray_cc_library(
102+
name = "sysfs_cgroup_driver",
103+
srcs = ["sysfs_cgroup_driver.cc"],
104+
hdrs = ["sysfs_cgroup_driver.h"],
105+
visibility = [":__subpackages__"],
101106
deps = [
102107
":cgroup_driver_interface",
103-
":cgroup_manager_interface",
104108
"//src/ray/common:status",
105109
"//src/ray/common:status_or",
110+
"//src/ray/util:logging",
111+
"@com_google_absl//absl/strings",
106112
],
107113
)
108114

src/ray/common/cgroup2/cgroup_manager.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,7 @@ class CgroupManager : public CgroupManagerInterface {
7878
2) the system leaf cgroup i.e. the destination cgroup.
7979
3) the lowest common ancestor of the source and destination cgroups.
8080
81-
TODO(#54703): There currently is not a good way to signal to the caller that
82-
the method can cause a FATAL error. Revisit this once we've settled on a pattern.
83-
84-
NOTE: If the process does not have adequate cgroup permissions or the application leaf
81+
@note If the process does not have adequate cgroup permissions or the application leaf
8582
cgroup does not exist, this will fail a RAY_CHECK.
8683
8784
@param pid of the process to move into the application leaf cgroup.
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Copyright 2025 The Ray Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#pragma once
15+
16+
#include <memory>
17+
#include <string>
18+
19+
#include "ray/common/cgroup2/cgroup_manager_interface.h"
20+
21+
namespace ray {
22+
23+
// TODO(#54703): Refactor the configs into a struct called CgroupManagerConfig
24+
// and delegate input validation and error messages to it.
25+
class CgroupManagerFactory {
26+
public:
27+
/**
28+
29+
This feature is only enabled in Linux. If using Linux, validates inputs, creates the
30+
ray cgroup hierarchy, enables constraints, and moves all system processes into the
31+
system cgroup.
32+
33+
On non-Linux platforms, this will return a noop implementation.
34+
35+
@param enable_resource_isolation if true, will create process isolation using
36+
cgroups (@see CgroupManager::Create for more information).
37+
@param cgroup_path the cgroup that the process will take ownership of.
38+
@param node_id used to create a unique cgroup subtree per running ray node.
39+
@param system_reserved_cpu_weight a value between [1,10000] to assign to the cgroup
40+
for system processes. The cgroup for application processes gets 10000 -
41+
system_reserved_cpu_weight.
42+
@param system_reserved_memory_bytes used to reserve memory for the system cgroup.
43+
@param system_pids a comma-separated list of pids of ray system processes to move into
44+
the system cgroup.
45+
46+
For more information about the parameters, see @ref CgroupManager::Create.
47+
48+
@note any of the following is a precondition violation and will cause a RAY_CHECK to fail
49+
1. enable_resource_isolation is true and either
50+
a. cgroup_path is empty
51+
b. system_reserved_cpu_weight or system_reserved_memory_bytes are -1.
52+
2. The CgroupManager's precondition checks fail
53+
a. cgroupv2 is not mounted correctly in unified mode (see @ref
54+
CgroupDriverInterface::CheckCgroupv2Enabled).
55+
b. the current process does not have adequate permissions (see @ref
56+
CgroupManager::Create).
57+
c. supported cgroup controllers are not available (see @ref
58+
CgroupManager::supported_controllers_).
59+
3. if a process in system_pids cannot be moved into the system cgroup.
60+
*/
61+
static std::unique_ptr<CgroupManagerInterface> Create(
62+
bool enable_resource_isolation,
63+
std::string cgroup_path,
64+
const std::string &node_id,
65+
const int64_t system_reserved_cpu_weight,
66+
const int64_t system_reserved_memory_bytes,
67+
const std::string &system_pids);
68+
};
69+
} // namespace ray

src/ray/common/cgroup2/cgroup_manager_interface.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@
1313
// limitations under the License.
1414
#pragma once
1515

16-
#include <sys/types.h>
17-
1816
#include <limits>
1917
#include <memory>
2018
#include <string>
@@ -50,10 +48,7 @@ class CgroupManagerInterface {
5048
2) the system leaf cgroup i.e. the destination cgroup.
5149
3) the lowest common ancestor of the source and destination cgroups.
5250
53-
TODO(#54703): There currently is not a good way to signal to the caller that
54-
the method can cause a FATAL error. Revisit this once we've settled on a pattern.
55-
56-
NOTE: If the process does not have adequate cgroup permissions or the application leaf
51+
@note If the process does not have adequate cgroup permissions or the application leaf
5752
cgroup does not exist, this will fail a RAY_CHECK.
5853
5954
@param pid of the process to move into the system leaf cgroup.
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// Copyright 2025 The Ray Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#include <sys/types.h>
15+
#include <unistd.h>
16+
17+
#include <memory>
18+
#include <string>
19+
#include <utility>
20+
#include <vector>
21+
22+
#include "absl/strings/str_format.h"
23+
#include "absl/strings/str_split.h"
24+
#include "ray/common/cgroup2/cgroup_driver_interface.h"
25+
#include "ray/common/cgroup2/cgroup_manager.h"
26+
#include "ray/common/cgroup2/cgroup_manager_factory.h"
27+
#include "ray/common/cgroup2/cgroup_manager_interface.h"
28+
#include "ray/common/cgroup2/noop_cgroup_manager.h"
29+
#include "ray/common/cgroup2/sysfs_cgroup_driver.h"
30+
31+
namespace ray {
32+
33+
std::unique_ptr<CgroupManagerInterface> CgroupManagerFactory::Create(
34+
bool enable_resource_isolation,
35+
std::string cgroup_path,
36+
const std::string &node_id,
37+
const int64_t system_reserved_cpu_weight,
38+
const int64_t system_reserved_memory_bytes,
39+
const std::string &system_pids) {
40+
if (!enable_resource_isolation) {
41+
return std::make_unique<NoopCgroupManager>();
42+
}
43+
44+
RAY_CHECK(!cgroup_path.empty())
45+
<< "Failed to start CgroupManager. If enable_resource_isolation is set to true, "
46+
"cgroup_path cannot be empty.";
47+
48+
RAY_CHECK_NE(system_reserved_cpu_weight, -1)
49+
<< "Failed to start CgroupManager. If enable_resource_isolation is set to true, "
50+
"system_reserved_cpu_weight must be set to a value between [1,10000]";
51+
52+
RAY_CHECK_NE(system_reserved_memory_bytes, -1)
53+
<< "Failed to start CgroupManager. If enable_resource_isolation is set to true, "
54+
"system_reserved_memory_bytes must be set to a value > 0";
55+
56+
StatusOr<std::unique_ptr<CgroupManagerInterface>> cgroup_manager_s =
57+
CgroupManager::Create(cgroup_path,
58+
node_id,
59+
system_reserved_cpu_weight,
60+
system_reserved_memory_bytes,
61+
std::make_unique<SysFsCgroupDriver>());
62+
63+
RAY_CHECK(cgroup_manager_s.ok()) << absl::StrFormat(
64+
"Failed to start CgroupManager due to %s.", cgroup_manager_s.ToString());
65+
66+
std::unique_ptr<CgroupManagerInterface> cgroup_manager =
67+
std::move(cgroup_manager_s.value());
68+
69+
std::vector<std::string> system_pids_to_move;
70+
if (!system_pids.empty()) {
71+
system_pids_to_move = std::move(absl::StrSplit(system_pids, ","));
72+
}
73+
74+
system_pids_to_move.emplace_back(std::to_string(getpid()));
75+
76+
for (const auto &pid : system_pids_to_move) {
77+
RAY_CHECK_OK(cgroup_manager->AddProcessToSystemCgroup(pid))
78+
<< absl::StrFormat("Failed to move process with pid %s into system cgroup.", pid);
79+
}
80+
81+
return cgroup_manager;
82+
}
83+
} // namespace ray

src/ray/common/cgroup2/noop_cgroup_manager.cc

Lines changed: 0 additions & 48 deletions
This file was deleted.

0 commit comments

Comments
 (0)