Skip to content

Commit dcd4336

Browse files
dayshahelliot-barn
authored andcommitted
[core] Kill raylet file and just keep node manager file (#57817)
Signed-off-by: dayshah <dhyey2019@gmail.com> Signed-off-by: elliot-barn <elliot.barnwell@anyscale.com>
1 parent b2cce45 commit dcd4336

File tree

12 files changed

+176
-348
lines changed

12 files changed

+176
-348
lines changed

src/mock/ray/gcs_client/accessor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ class MockNodeInfoAccessor : public NodeInfoAccessor {
118118
public:
119119
MOCK_METHOD(void,
120120
RegisterSelf,
121-
(const rpc::GcsNodeInfo &local_node_info, const StatusCallback &callback),
121+
(rpc::GcsNodeInfo && local_node_info, const StatusCallback &callback),
122122
(override));
123123
MOCK_METHOD(void,
124124
AsyncRegister,

src/ray/gcs_rpc_client/accessor.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -468,18 +468,17 @@ bool ActorInfoAccessor::IsActorUnsubscribed(const ActorID &actor_id) {
468468

469469
NodeInfoAccessor::NodeInfoAccessor(GcsClient *client_impl) : client_impl_(client_impl) {}
470470

471-
void NodeInfoAccessor::RegisterSelf(const rpc::GcsNodeInfo &local_node_info,
471+
void NodeInfoAccessor::RegisterSelf(rpc::GcsNodeInfo &&local_node_info,
472472
const StatusCallback &callback) {
473473
auto node_id = NodeID::FromBinary(local_node_info.node_id());
474474
RAY_LOG(DEBUG).WithField(node_id)
475475
<< "Registering node info, address is = " << local_node_info.node_manager_address();
476476
RAY_CHECK(local_node_info.state() == rpc::GcsNodeInfo::ALIVE);
477477
rpc::RegisterNodeRequest request;
478-
request.mutable_node_info()->CopyFrom(local_node_info);
478+
*request.mutable_node_info() = std::move(local_node_info);
479479
client_impl_->GetGcsRpcClient().RegisterNode(
480480
std::move(request),
481-
[node_id, local_node_info, callback](const Status &status,
482-
rpc::RegisterNodeReply &&) {
481+
[node_id, callback](const Status &status, rpc::RegisterNodeReply &&) {
483482
if (callback) {
484483
callback(status);
485484
}

src/ray/gcs_rpc_client/accessor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ class NodeInfoAccessor {
302302
///
303303
/// \param node_info The information of node to register to GCS.
304304
/// \param callback Callback that will be called when registration is complete.
305-
virtual void RegisterSelf(const rpc::GcsNodeInfo &local_node_info,
305+
virtual void RegisterSelf(rpc::GcsNodeInfo &&local_node_info,
306306
const StatusCallback &callback);
307307

308308
/// Unregister local node to GCS asynchronously.

src/ray/gcs_rpc_client/tests/gcs_client_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,8 @@ class GcsClientTest : public ::testing::TestWithParam<bool> {
375375
return WaitReady(promise.get_future(), timeout_ms_);
376376
}
377377

378-
void RegisterSelf(const rpc::GcsNodeInfo &local_node_info) {
379-
gcs_client_->Nodes().RegisterSelf(local_node_info, nullptr);
378+
void RegisterSelf(rpc::GcsNodeInfo local_node_info) {
379+
gcs_client_->Nodes().RegisterSelf(std::move(local_node_info), nullptr);
380380
}
381381

382382
bool RegisterNode(const rpc::GcsNodeInfo &node_info) {

src/ray/raylet/BUILD.bazel

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ ray_cc_library(
208208
)
209209

210210
ray_cc_library(
211-
name = "node_manager",
211+
name = "raylet_lib",
212212
srcs = [
213213
"node_manager.cc",
214214
],
@@ -267,21 +267,6 @@ ray_cc_library(
267267
],
268268
)
269269

270-
ray_cc_library(
271-
name = "raylet_lib",
272-
srcs = ["raylet.cc"],
273-
hdrs = ["raylet.h"],
274-
visibility = ["//visibility:private"],
275-
deps = [
276-
":node_manager",
277-
"//src/ray/common:asio",
278-
"//src/ray/object_manager",
279-
"//src/ray/util:network_util",
280-
"//src/ray/util:time",
281-
"@boost//:asio",
282-
],
283-
)
284-
285270
ray_cc_binary(
286271
name = "raylet",
287272
srcs = ["main.cc"],

src/ray/raylet/main.cc

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@
4242
#include "ray/object_manager_rpc_client/object_manager_client.h"
4343
#include "ray/raylet/local_object_manager.h"
4444
#include "ray/raylet/local_object_manager_interface.h"
45-
#include "ray/raylet/raylet.h"
45+
#include "ray/raylet/node_manager.h"
46+
#include "ray/raylet_ipc_client/client_connection.h"
4647
#include "ray/raylet_rpc_client/raylet_client.h"
4748
#include "ray/stats/stats.h"
4849
#include "ray/stats/tag_defs.h"
@@ -326,7 +327,6 @@ int main(int argc, char *argv[]) {
326327
gcs_client = std::make_unique<ray::gcs::GcsClient>(client_options, node_ip_address);
327328

328329
RAY_CHECK_OK(gcs_client->Connect(main_service));
329-
std::unique_ptr<ray::raylet::Raylet> raylet;
330330

331331
ray::stats::Gauge task_by_state_counter = ray::core::GetTaskByStateGaugeMetric();
332332
std::shared_ptr<plasma::PlasmaClient> plasma_client;
@@ -396,7 +396,7 @@ int main(int argc, char *argv[]) {
396396
auto shutdown_raylet_gracefully =
397397
[raylet_node_id,
398398
&shutting_down,
399-
&raylet,
399+
&node_manager,
400400
&main_service,
401401
&raylet_socket_name,
402402
&gcs_client,
@@ -412,12 +412,12 @@ int main(int argc, char *argv[]) {
412412

413413
auto unregister_done_callback = [&main_service,
414414
&raylet_socket_name,
415-
&raylet,
415+
&node_manager,
416416
&gcs_client,
417417
&object_manager_rpc_threads]() {
418418
// We should stop the service and remove the local socket
419419
// file.
420-
raylet->Stop();
420+
node_manager->Stop();
421421
gcs_client->Disconnect();
422422
ray::stats::Shutdown();
423423
main_service.stop();
@@ -912,6 +912,10 @@ int main(int argc, char *argv[]) {
912912
};
913913

914914
plasma_client = std::make_shared<plasma::PlasmaClient>();
915+
boost::asio::basic_socket_acceptor<ray::local_stream_protocol> acceptor(
916+
main_service, ray::ParseUrlEndpoint(raylet_socket_name));
917+
ray::local_stream_socket socket(main_service);
918+
ray::SetCloseOnExec(acceptor);
915919
node_manager = std::make_unique<ray::raylet::NodeManager>(
916920
main_service,
917921
raylet_node_id,
@@ -939,20 +943,9 @@ int main(int argc, char *argv[]) {
939943
shutdown_raylet_gracefully,
940944
std::move(add_process_to_system_cgroup_hook),
941945
std::move(cgroup_manager),
942-
shutting_down);
943-
944-
// Initialize the node manager.
945-
raylet = std::make_unique<ray::raylet::Raylet>(main_service,
946-
raylet_node_id,
947-
raylet_socket_name,
948-
node_ip_address,
949-
node_name,
950-
node_manager_config,
951-
object_manager_config,
952-
*gcs_client,
953-
metrics_export_port,
954-
is_head_node,
955-
*node_manager);
946+
shutting_down,
947+
std::move(acceptor),
948+
std::move(socket));
956949

957950
// Initializing stats should be done after the node manager is initialized because
958951
// <explain why>. Metrics exported before this call will be buffered until `Init` is
@@ -977,20 +970,48 @@ int main(int argc, char *argv[]) {
977970
const std::vector<ray::SourceTypeVariant> source_types = {
978971
ray::rpc::Event_SourceType::Event_SourceType_RAYLET};
979972
ray::RayEventInit(source_types,
980-
{{"node_id", raylet->GetNodeId().Hex()}},
973+
{{"node_id", raylet_node_id.Hex()}},
981974
log_dir,
982975
RayConfig::instance().event_level(),
983976
RayConfig::instance().emit_event_to_log_file());
984977
};
985978

986-
raylet->Start();
979+
ray::rpc::GcsNodeInfo self_node_info;
980+
self_node_info.set_node_id(raylet_node_id.Binary());
981+
self_node_info.set_state(ray::rpc::GcsNodeInfo::ALIVE);
982+
self_node_info.set_node_manager_address(node_ip_address);
983+
self_node_info.set_node_name(node_name);
984+
self_node_info.set_raylet_socket_name(raylet_socket_name);
985+
self_node_info.set_object_store_socket_name(object_manager_config.store_socket_name);
986+
self_node_info.set_object_manager_port(object_manager->GetServerPort());
987+
self_node_info.set_node_manager_port(node_manager->GetServerPort());
988+
self_node_info.set_node_manager_hostname(boost::asio::ip::host_name());
989+
self_node_info.set_metrics_export_port(metrics_export_port);
990+
self_node_info.set_runtime_env_agent_port(node_manager_config.runtime_env_agent_port);
991+
self_node_info.mutable_state_snapshot()->set_state(ray::rpc::NodeSnapshot::ACTIVE);
992+
auto resource_map = node_manager_config.resource_config.GetResourceMap();
993+
self_node_info.mutable_resources_total()->insert(resource_map.begin(),
994+
resource_map.end());
995+
self_node_info.set_start_time_ms(ray::current_sys_time_ms());
996+
self_node_info.set_is_head_node(is_head_node);
997+
self_node_info.mutable_labels()->insert(node_manager_config.labels.begin(),
998+
node_manager_config.labels.end());
999+
// Setting up autoscaler related fields from ENV
1000+
auto instance_id = std::getenv(kNodeCloudInstanceIdEnv);
1001+
self_node_info.set_instance_id(instance_id ? instance_id : "");
1002+
auto cloud_node_type_name = std::getenv(kNodeTypeNameEnv);
1003+
self_node_info.set_node_type_name(cloud_node_type_name ? cloud_node_type_name : "");
1004+
auto instance_type_name = std::getenv(kNodeCloudInstanceTypeNameEnv);
1005+
self_node_info.set_instance_type_name(instance_type_name ? instance_type_name : "");
1006+
1007+
node_manager->Start(std::move(self_node_info));
9871008
});
9881009

989-
auto signal_handler = [&raylet, shutdown_raylet_gracefully](
1010+
auto signal_handler = [&node_manager, shutdown_raylet_gracefully](
9901011
const boost::system::error_code &error, int signal_number) {
9911012
ray::rpc::NodeDeathInfo node_death_info;
9921013
std::optional<ray::rpc::DrainRayletRequest> drain_request =
993-
raylet->node_manager().GetLocalDrainRequest();
1014+
node_manager->GetLocalDrainRequest();
9941015
RAY_LOG(INFO) << "received SIGTERM. Existing local drain request = "
9951016
<< (drain_request.has_value() ? drain_request->DebugString() : "None");
9961017
if (drain_request.has_value() &&

src/ray/raylet/node_manager.cc

Lines changed: 99 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "ray/raylet/node_manager.h"
1616

1717
#include <algorithm>
18+
#include <boost/bind/bind.hpp>
1819
#include <cctype>
1920
#include <cerrno>
2021
#include <csignal>
@@ -121,6 +122,33 @@ void CleanupProcessGroupSend(pid_t saved_pgid,
121122
}
122123
#endif
123124

125+
std::vector<std::string> GenerateEnumNames(const char *const *enum_names_ptr,
126+
int start_index,
127+
int end_index) {
128+
std::vector<std::string> enum_names;
129+
enum_names.reserve(start_index);
130+
for (int i = 0; i < start_index; ++i) {
131+
enum_names.emplace_back("EmptyMessageType");
132+
}
133+
size_t i = 0;
134+
while (true) {
135+
const char *name = enum_names_ptr[i];
136+
if (name == nullptr) {
137+
break;
138+
}
139+
enum_names.emplace_back(name);
140+
i++;
141+
}
142+
RAY_CHECK(static_cast<size_t>(end_index) == enum_names.size() - 1)
143+
<< "Message Type mismatch!";
144+
return enum_names;
145+
}
146+
147+
const std::vector<std::string> node_manager_message_enum =
148+
GenerateEnumNames(ray::protocol::EnumNamesMessageType(),
149+
static_cast<int>(ray::protocol::MessageType::MIN),
150+
static_cast<int>(ray::protocol::MessageType::MAX));
151+
124152
} // namespace
125153

126154
NodeManager::NodeManager(
@@ -148,7 +176,9 @@ NodeManager::NodeManager(
148176
std::function<void(const rpc::NodeDeathInfo &)> shutdown_raylet_gracefully,
149177
AddProcessToCgroupHook add_process_to_system_cgroup_hook,
150178
std::unique_ptr<CgroupManagerInterface> cgroup_manager,
151-
std::atomic_bool &shutting_down)
179+
std::atomic_bool &shutting_down,
180+
boost::asio::basic_socket_acceptor<local_stream_protocol> acceptor,
181+
local_stream_socket socket)
152182
: self_node_id_(self_node_id),
153183
self_node_name_(std::move(self_node_name)),
154184
io_service_(io_service),
@@ -202,7 +232,9 @@ NodeManager::NodeManager(
202232
CreateMemoryUsageRefreshCallback())),
203233
add_process_to_system_cgroup_hook_(std::move(add_process_to_system_cgroup_hook)),
204234
cgroup_manager_(std::move(cgroup_manager)),
205-
shutting_down_(shutting_down) {
235+
shutting_down_(shutting_down),
236+
acceptor_(std::move(acceptor)),
237+
socket_(std::move(socket)) {
206238
RAY_LOG(INFO).WithField(kLogKeyNodeID, self_node_id_) << "Initializing NodeManager";
207239

208240
placement_group_resource_manager_ =
@@ -251,6 +283,29 @@ NodeManager::NodeManager(
251283
"NodeManager.GCTaskFailureReason");
252284
}
253285

286+
void NodeManager::Start(rpc::GcsNodeInfo &&self_node_info) {
287+
auto register_callback =
288+
[this,
289+
object_manager_port = self_node_info.object_manager_port()](const Status &status) {
290+
RAY_CHECK_OK(status);
291+
RAY_LOG(INFO) << "Raylet of id, " << self_node_id_
292+
<< " started. Raylet consists of node_manager and object_manager."
293+
<< " node_manager address: "
294+
<< BuildAddress(initial_config_.node_manager_address,
295+
initial_config_.node_manager_port)
296+
<< " object_manager address: "
297+
<< BuildAddress(initial_config_.node_manager_address,
298+
object_manager_port)
299+
<< " hostname: " << boost::asio::ip::host_name();
300+
this->RegisterGcs();
301+
};
302+
gcs_client_.Nodes().RegisterSelf(std::move(self_node_info), register_callback);
303+
304+
acceptor_.async_accept(
305+
socket_,
306+
boost::bind(&NodeManager::HandleAccept, this, boost::asio::placeholders::error));
307+
}
308+
254309
void NodeManager::RegisterGcs() {
255310
auto on_node_change = [this](const NodeID &node_id,
256311
const rpc::GcsNodeAddressAndLiveness &data) {
@@ -410,6 +465,45 @@ void NodeManager::RegisterGcs() {
410465
"NodeManager.GcsCheckAlive");
411466
}
412467

468+
void NodeManager::HandleAccept(const boost::system::error_code &error) {
469+
if (!error) {
470+
ConnectionErrorHandler error_handler =
471+
[this](const std::shared_ptr<ClientConnection> &client,
472+
const boost::system::error_code &err) {
473+
this->HandleClientConnectionError(client, err);
474+
};
475+
476+
MessageHandler message_handler = [this](
477+
const std::shared_ptr<ClientConnection> &client,
478+
int64_t message_type,
479+
const std::vector<uint8_t> &message) {
480+
this->ProcessClientMessage(client, message_type, message.data());
481+
};
482+
483+
// Accept a new local client and dispatch it to the node manager.
484+
auto conn = ClientConnection::Create(message_handler,
485+
error_handler,
486+
std::move(socket_),
487+
"worker",
488+
node_manager_message_enum);
489+
490+
// Begin processing messages. The message handler above is expected to call this to
491+
// continue processing messages.
492+
conn->ProcessMessages();
493+
} else {
494+
RAY_LOG(ERROR) << "Raylet failed to accept new connection: " << error.message();
495+
if (error == boost::asio::error::operation_aborted) {
496+
// The server is being destroyed. Don't continue accepting connections.
497+
return;
498+
}
499+
};
500+
501+
// We're ready to accept another client.
502+
acceptor_.async_accept(
503+
socket_,
504+
boost::bind(&NodeManager::HandleAccept, this, boost::asio::placeholders::error));
505+
}
506+
413507
void NodeManager::DestroyWorker(std::shared_ptr<WorkerInterface> worker,
414508
rpc::WorkerExitType disconnect_type,
415509
const std::string &disconnect_detail,
@@ -1298,7 +1392,6 @@ void NodeManager::HandleWorkerAvailable(const std::shared_ptr<WorkerInterface> &
12981392
}
12991393

13001394
namespace {
1301-
13021395
void SendDisconnectClientReply(const WorkerID &worker_id,
13031396
const std::shared_ptr<ClientConnection> &client) {
13041397
flatbuffers::FlatBufferBuilder fbb;
@@ -2821,8 +2914,8 @@ void NodeManager::Stop() {
28212914
#if !defined(_WIN32)
28222915
// Best-effort process-group cleanup for any remaining workers before shutdown.
28232916
if (RayConfig::instance().process_group_cleanup_enabled()) {
2824-
auto workers = worker_pool_.GetAllRegisteredWorkers(/* filter_dead_worker */ true,
2825-
/* filter_io_workers */ false);
2917+
auto workers = worker_pool_.GetAllRegisteredWorkers(/* filter_dead_workers=*/true,
2918+
/* filter_io_workers=*/false);
28262919
for (const auto &w : workers) {
28272920
auto saved = w->GetSavedProcessGroupId();
28282921
if (saved.has_value()) {
@@ -2840,6 +2933,7 @@ void NodeManager::Stop() {
28402933
object_manager_.Stop();
28412934
dashboard_agent_manager_.reset();
28422935
runtime_env_agent_manager_.reset();
2936+
acceptor_.close();
28432937
}
28442938

28452939
void NodeManager::RecordMetrics() {

0 commit comments

Comments
 (0)