Skip to content

Commit c4c774e

Browse files
committed
Revert "[core] (cgroups 6/n) CgroupManager cleans up the entire cgroup hierarchy in reverse order when destroyed. (ray-project#56260)"
This reverts commit f900567.
1 parent b535134 commit c4c774e

File tree

18 files changed

+159
-493
lines changed

18 files changed

+159
-493
lines changed

python/ray/_private/worker.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2440,6 +2440,7 @@ def is_initialized() -> bool:
24402440
return ray._private.worker.global_worker.connected
24412441

24422442

2443+
# TODO(hjiang): Add cgroup path along with [enable_resource_isolation].
24432444
@with_connect_or_shutdown_lock
24442445
def connect(
24452446
node,
@@ -2458,6 +2459,7 @@ def connect(
24582459
worker_launch_time_ms: int = -1,
24592460
worker_launched_time_ms: int = -1,
24602461
debug_source: str = "",
2462+
enable_resource_isolation: bool = False,
24612463
):
24622464
"""Connect this worker to the raylet, to Plasma, and to GCS.
24632465
@@ -2486,6 +2488,7 @@ def connect(
24862488
finshes launching. If the worker is not launched by raylet (e.g.,
24872489
driver), this must be -1 (default value).
24882490
debug_source: Source information for `CoreWorker`, used for debugging and informational purpose, rather than functional purpose.
2491+
enable_resource_isolation: If true, core worker enables resource isolation by adding itself into appropriate cgroup.
24892492
"""
24902493
# Do some basic checking to make sure we didn't call ray.init twice.
24912494
error_message = "Perhaps you called ray.init twice by accident?"
@@ -2664,6 +2667,7 @@ def connect(
26642667
worker_launch_time_ms,
26652668
worker_launched_time_ms,
26662669
debug_source,
2670+
enable_resource_isolation,
26672671
)
26682672

26692673
if mode == SCRIPT_MODE:

python/ray/_private/workers/default_worker.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@
271271
ray_debugger_external=args.ray_debugger_external,
272272
worker_launch_time_ms=args.worker_launch_time_ms,
273273
worker_launched_time_ms=worker_launched_time_ms,
274+
enable_resource_isolation=args.enable_resource_isolation,
274275
)
275276

276277
worker = ray._private.worker.global_worker

python/ray/_raylet.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3002,7 +3002,7 @@ cdef class CoreWorker:
30023002
local_mode, driver_name,
30033003
serialized_job_config, metrics_agent_port, runtime_env_hash,
30043004
startup_token, session_name, cluster_id, entrypoint,
3005-
worker_launch_time_ms, worker_launched_time_ms, debug_source):
3005+
worker_launch_time_ms, worker_launched_time_ms, debug_source, enable_resource_isolation):
30063006
self.is_local_mode = local_mode
30073007

30083008
cdef CCoreWorkerOptions options = CCoreWorkerOptions()
@@ -3058,6 +3058,7 @@ cdef class CoreWorker:
30583058
options.worker_launch_time_ms = worker_launch_time_ms
30593059
options.worker_launched_time_ms = worker_launched_time_ms
30603060
options.debug_source = debug_source
3061+
options.enable_resource_isolation = enable_resource_isolation
30613062
CCoreWorkerProcess.Initialize(options)
30623063

30633064
self.cgname_to_eventloop_dict = None

python/ray/includes/libcoreworker.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
439439
int64_t worker_launch_time_ms
440440
int64_t worker_launched_time_ms
441441
c_string debug_source
442+
c_bool enable_resource_isolation
442443

443444
cdef cppclass CCoreWorkerProcess "ray::core::CoreWorkerProcess":
444445
@staticmethod

src/ray/common/cgroup2/BUILD.bazel

Lines changed: 21 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,13 @@
11
load("//bazel:ray.bzl", "ray_cc_library")
22

3-
config_setting(
4-
name = "is_linux",
5-
constraint_values = ["@platforms//os:linux"],
6-
)
7-
8-
# Public targets.
9-
ray_cc_library(
10-
name = "cgroup_manager",
11-
srcs = select({
12-
":is_linux": ["cgroup_manager.cc"],
13-
"//conditions:default": ["noop_cgroup_manager.cc"],
14-
}),
15-
hdrs = ["cgroup_manager.h"],
16-
visibility = ["//visibility:public"],
17-
deps = [
18-
":cgroup_driver_interface",
19-
":cgroup_manager_interface",
20-
"//src/ray/common:status",
21-
"//src/ray/common:status_or",
22-
] + select({
23-
":is_linux": [
24-
":scoped_cgroup_operation",
25-
"//src/ray/util:logging",
26-
"@com_google_absl//absl/strings",
27-
],
28-
"//conditions:default": [],
29-
}),
30-
)
31-
323
ray_cc_library(
334
name = "cgroup_driver_interface",
345
hdrs = [
356
"cgroup_driver_interface.h",
367
],
37-
visibility = ["//visibility:public"],
8+
target_compatible_with = [
9+
"@platforms//os:linux",
10+
],
3811
deps = [
3912
"//src/ray/common:status",
4013
"//src/ray/common:status_or",
@@ -46,42 +19,51 @@ ray_cc_library(
4619
hdrs = [
4720
"cgroup_manager_interface.h",
4821
],
49-
visibility = ["//visibility:public"],
22+
target_compatible_with = [
23+
"@platforms//os:linux",
24+
],
5025
deps = [
5126
"//src/ray/common:status",
5227
"//src/ray/common:status_or",
5328
],
5429
)
5530

5631
ray_cc_library(
57-
name = "sysfs_cgroup_driver",
58-
srcs = ["sysfs_cgroup_driver.cc"],
32+
name = "cgroup_manager",
33+
srcs = ["cgroup_manager.cc"],
5934
hdrs = [
60-
"sysfs_cgroup_driver.h",
35+
"cgroup_manager.h",
36+
"scoped_cgroup_operation.h",
6137
],
6238
target_compatible_with = [
6339
"@platforms//os:linux",
6440
],
65-
visibility = ["//visibility:public"],
6641
deps = [
6742
":cgroup_driver_interface",
43+
":cgroup_manager_interface",
6844
"//src/ray/common:status",
6945
"//src/ray/common:status_or",
7046
"//src/ray/util:logging",
7147
"@com_google_absl//absl/strings",
7248
],
7349
)
7450

75-
# Private Targets.
7651
ray_cc_library(
77-
name = "scoped_cgroup_operation",
52+
name = "sysfs_cgroup_driver",
53+
srcs = ["sysfs_cgroup_driver.cc"],
7854
hdrs = [
79-
"scoped_cgroup_operation.h",
55+
"sysfs_cgroup_driver.h",
8056
],
8157
target_compatible_with = [
8258
"@platforms//os:linux",
8359
],
84-
visibility = [":__subpackages__"],
60+
deps = [
61+
":cgroup_driver_interface",
62+
"//src/ray/common:status",
63+
"//src/ray/common:status_or",
64+
"//src/ray/util:logging",
65+
"@com_google_absl//absl/strings",
66+
],
8567
)
8668

8769
ray_cc_library(
@@ -92,7 +74,6 @@ ray_cc_library(
9274
target_compatible_with = [
9375
"@platforms//os:linux",
9476
],
95-
visibility = [":__subpackages__"],
9677
deps = [
9778
":cgroup_driver_interface",
9879
"//src/ray/common:status",
@@ -106,7 +87,6 @@ ray_cc_library(
10687
target_compatible_with = [
10788
"@platforms//os:linux",
10889
],
109-
visibility = [":__subpackages__"],
11090
deps = [
11191
"//src/ray/common:id",
11292
"//src/ray/common:status",

src/ray/common/cgroup2/cgroup_manager.cc

Lines changed: 29 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -113,65 +113,48 @@ StatusOr<std::unique_ptr<CgroupManager>> CgroupManager::Create(
113113
return cgroup_manager;
114114
}
115115

116+
// TODO(#54703): This is a placeholder for cleanup. This will call
117+
// CgroupDriver::DeleteCgroup.
116118
void CgroupManager::RegisterDeleteCgroup(const std::string &cgroup_path) {
117-
cleanup_operations_.emplace_back([this, cgroup = cgroup_path]() {
118-
Status s = this->cgroup_driver_->DeleteCgroup(cgroup);
119-
if (!s.ok()) {
120-
RAY_LOG(WARNING) << absl::StrFormat(
121-
"Failed to delete cgroup %s with error %s.", cgroup, s.ToString());
122-
}
119+
cleanup_operations_.emplace_back([cgroup = cgroup_path]() {
120+
RAY_LOG(INFO) << absl::StrFormat("Deleting all cgroup %s.", cgroup);
123121
});
124122
}
125123

124+
// TODO(#54703): This is a placeholder for cleanup. This will call
125+
// CgroupDriver::MoveAllProcesses.
126126
void CgroupManager::RegisterMoveAllProcesses(const std::string &from,
127127
const std::string &to) {
128-
cleanup_operations_.emplace_back([this, from_cgroup = from, to_cgroup = to]() {
129-
Status s = this->cgroup_driver_->MoveAllProcesses(from_cgroup, to_cgroup);
130-
if (!s.ok()) {
131-
RAY_LOG(WARNING) << absl::StrFormat(
132-
"Failed to move all processes from %s to %s with error %s",
133-
from_cgroup,
134-
to_cgroup,
135-
s.ToString());
136-
}
128+
cleanup_operations_.emplace_back([from_cgroup = from, to_cgroup = to]() {
129+
RAY_LOG(INFO) << absl::StrFormat(
130+
"Moved All Processes from %s to %s.", from_cgroup, to_cgroup);
137131
});
138132
}
139133

134+
// TODO(#54703): This is a placeholder for cleanup. This will call
135+
// CgroupDriver::AddConstraint(cgroup, constraint, default_value).
140136
template <typename T>
141137
void CgroupManager::RegisterRemoveConstraint(const std::string &cgroup,
142138
const Constraint<T> &constraint) {
143139
cleanup_operations_.emplace_back(
144-
[this, constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
145-
std::string default_value = std::to_string(constraint_to_remove.default_value_);
146-
Status s = this->cgroup_driver_->AddConstraint(constrained_cgroup,
147-
constraint_to_remove.controller_,
148-
constraint_to_remove.name_,
149-
default_value);
150-
if (!s.ok()) {
151-
RAY_LOG(WARNING) << absl::StrFormat(
152-
"Failed to set constraint %s=%s to default value for cgroup %s with error "
153-
"%s.",
154-
constraint_to_remove.name_,
155-
default_value,
156-
constrained_cgroup,
157-
s.ToString());
158-
}
140+
[constrained_cgroup = cgroup, constraint_to_remove = constraint]() {
141+
RAY_LOG(INFO) << absl::StrFormat(
142+
"Setting constraint %s to default value %lld for cgroup %s",
143+
constraint_to_remove.name_,
144+
constraint_to_remove.default_value_,
145+
constrained_cgroup);
159146
});
160147
}
161148

162-
void CgroupManager::RegisterDisableController(const std::string &cgroup_path,
149+
// TODO(#54703): This is a placeholder for cleanup. This will call
150+
// CgroupDriver::DisableController.
151+
void CgroupManager::RegisterDisableController(const std::string &cgroup,
163152
const std::string &controller) {
164-
cleanup_operations_.emplace_back(
165-
[this, cgroup = cgroup_path, controller_to_disable = controller]() {
166-
Status s = this->cgroup_driver_->DisableController(cgroup, controller_to_disable);
167-
if (!s.ok()) {
168-
RAY_LOG(WARNING) << absl::StrFormat(
169-
"Failed to disable controller %s for cgroup %s with error %s",
170-
controller_to_disable,
171-
cgroup,
172-
s.ToString());
173-
}
174-
});
153+
cleanup_operations_.emplace_back([cgroup_to_clean = cgroup,
154+
controller_to_disable = controller]() {
155+
RAY_LOG(INFO) << absl::StrFormat(
156+
"Disabling controller %s for cgroup %s.", controller_to_disable, cgroup_to_clean);
157+
});
175158
}
176159

177160
Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
@@ -185,11 +168,11 @@ Status CgroupManager::Initialize(int64_t system_reserved_cpu_weight,
185168
cpu_weight_constraint_.Max() - system_reserved_cpu_weight;
186169

187170
RAY_LOG(INFO) << absl::StrFormat(
188-
"Initializing CgroupManager at base cgroup at '%s'. Ray's cgroup "
189-
"hierarchy will under the node cgroup at '%s'. The %s controllers will be "
171+
"Initializing CgroupManager at base cgroup path at %s. Ray's cgroup "
172+
"hierarchy will under the node cgroup %s. The %s controllers will be "
190173
"enabled. "
191-
"The system cgroup at '%s' will have constraints [%s=%lld, %s=%lld]. "
192-
"The application cgroup '%s' will have constraints [%s=%lld].",
174+
"System cgroup %s will have constraints [%s=%lld, %s=%lld]. "
175+
"Application cgroup %s will have constraints [%s=%lld].",
193176
base_cgroup_path_,
194177
node_cgroup_path_,
195178
supported_controllers,

src/ray/common/cgroup2/cgroup_manager.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,9 @@ class CgroupManager : public CgroupManagerInterface {
111111
Status Initialize(const int64_t system_reserved_cpu_weight,
112112
const int64_t system_reserved_memory_bytes);
113113

114-
// The Register* methods register a callback that will execute in the destructor
115-
// in FILO order. All callbacks required the cgroup_driver_ to be available to
116-
// remove the cgroup hierarchy.
117-
void RegisterDeleteCgroup(const std::string &cgroup);
114+
// TODO(#54703): This is a placeholder for cleanup. This will be implemented in the a
115+
// future PR.
116+
void RegisterDeleteCgroup(const std::string &cgroup_path);
118117
void RegisterMoveAllProcesses(const std::string &from, const std::string &to);
119118
template <typename T>
120119
void RegisterRemoveConstraint(const std::string &cgroup,

0 commit comments

Comments
 (0)