Skip to content

Commit aa6bbb8

Browse files
committed
[core][metric] error handling of metric+event exporter agent
Signed-off-by: Cuong Nguyen <can@anyscale.com>
1 parent 2be8a2b commit aa6bbb8

File tree

4 files changed: 23 additions and 17 deletions.

src/ray/core_worker/core_worker_process.cc

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -819,7 +819,13 @@ CoreWorkerProcessImpl::CoreWorkerProcessImpl(const CoreWorkerOptions &options)
819819
io_service_,
820820
*write_locked.Get()->client_call_manager_);
821821
metrics_agent_client_->WaitForServerReady([this](const Status &server_status) {
822-
stats::InitOpenTelemetryExporter(options_.metrics_agent_port, server_status);
822+
if (server_status.ok()) {
823+
stats::InitOpenTelemetryExporter(options_.metrics_agent_port);
824+
} else {
825+
RAY_LOG(ERROR) << "Failed to establish connection to the metrics exporter agent. "
826+
"Metrics will not be exported. "
827+
<< "Exporter agent status: " << server_status.ToString();
828+
}
823829
});
824830
}
825831
}

src/ray/gcs/gcs_server.cc

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -295,13 +295,14 @@ void GcsServer::DoStart(const GcsInitData &gcs_init_data) {
295295

296296
// Init metrics and event exporter.
297297
metrics_agent_client_->WaitForServerReady([this](const Status &server_status) {
298-
stats::InitOpenTelemetryExporter(config_.metrics_agent_port, server_status);
299298
if (server_status.ok()) {
299+
stats::InitOpenTelemetryExporter(config_.metrics_agent_port);
300300
ray_event_recorder_->StartExportingEvents();
301301
} else {
302-
RAY_LOG(ERROR) << "Failed to establish connection to the event exporter. Events "
303-
"will not be exported. "
304-
<< "Event exporter status: " << server_status.ToString();
302+
RAY_LOG(ERROR)
303+
<< "Failed to establish connection to the event+metrics exporter agent. "
304+
"Events and metrics will not be exported. "
305+
<< "Exporter agent status: " << server_status.ToString();
305306
}
306307
});
307308

src/ray/raylet/main.cc

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -966,10 +966,16 @@ int main(int argc, char *argv[]) {
966966
ray::stats::Init(global_tags, metrics_agent_port, ray::WorkerID::Nil());
967967
metrics_agent_client = std::make_unique<ray::rpc::MetricsAgentClientImpl>(
968968
"127.0.0.1", metrics_agent_port, main_service, *client_call_manager);
969-
metrics_agent_client->WaitForServerReady(
970-
[metrics_agent_port](const ray::Status &server_status) {
971-
ray::stats::InitOpenTelemetryExporter(metrics_agent_port, server_status);
972-
});
969+
metrics_agent_client->WaitForServerReady([metrics_agent_port](
970+
const ray::Status &server_status) {
971+
if (server_status.ok()) {
972+
ray::stats::InitOpenTelemetryExporter(metrics_agent_port);
973+
} else {
974+
RAY_LOG(ERROR) << "Failed to establish connection to the metrics exporter agent. "
975+
"Metrics will not be exported. "
976+
<< "Exporter agent status: " << server_status.ToString();
977+
}
978+
});
973979

974980
// Initialize event framework. This should be done after the node manager is
975981
// initialized.

src/ray/stats/stats.h

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -113,17 +113,10 @@ static inline void Init(
113113
StatsConfig::instance().SetIsInitialized(true);
114114
}
115115

116-
static inline void InitOpenTelemetryExporter(const int metrics_agent_port,
117-
const Status &metrics_agent_server_status) {
116+
static inline void InitOpenTelemetryExporter(const int metrics_agent_port) {
118117
if (!RayConfig::instance().enable_open_telemetry()) {
119118
return;
120119
}
121-
if (!metrics_agent_server_status.ok()) {
122-
RAY_LOG(ERROR) << "Failed to initialize OpenTelemetry exporter. Data will not be "
123-
"exported to the "
124-
<< "metrics agent. Server status: " << metrics_agent_server_status;
125-
return;
126-
}
127120
OpenTelemetryMetricRecorder::GetInstance().RegisterGrpcExporter(
128121
/*endpoint=*/std::string("127.0.0.1:") + std::to_string(metrics_agent_port),
129122
/*interval=*/

0 commit comments

Comments
 (0)