Skip to content

Commit cc02220

Browse files
authored
[core] Clean up NODE_DIED task error message (#58638)
Minor follow-ups from #58539.

Example error message:

```
Task failed because the node it was running on is dead or unavailable. Node IP: 127.0.0.1, node ID: e55b8ca03ebf3f7418f51533d8d55abeaab75fa9b29e2e6282b47646. This can happen if the node was preempted, had a hardware failure, or its raylet crashed unexpectedly. To see node death information, use `ray list nodes --filter node_id=e55b8ca03ebf3f7418f51533d8d55abeaab75fa9b29e2e6282b47646`, check the Ray dashboard cluster page, search the node ID in the GCS logs, or use `ray logs raylet.out -ip 127.0.0.1`.
```

---------

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
1 parent 5feb457 commit cc02220

File tree

3 files changed

+17
-52
lines changed

3 files changed

+17
-52
lines changed

python/ray/tests/test_ray_event_export_task_events.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -859,14 +859,14 @@ def validate_task_killed(events: json):
859859
expected_task_id_error_info_dict = {
860860
(normal_task_id, 0): {
861861
"error_type": "NODE_DIED",
862-
"error_message": "Task failed due to the node (where this task was running) was dead or unavailable",
862+
"error_message": "Task failed because the node it was running on is dead or unavailable",
863863
}
864864
}
865865
else:
866866
expected_task_id_error_info_dict = {
867867
(normal_task_id, 0): {
868868
"errorType": "NODE_DIED",
869-
"errorMessage": "Task failed due to the node (where this task was running) was dead or unavailable",
869+
"errorMessage": "Task failed because the node it was running on is dead or unavailable",
870870
}
871871
}
872872
check_task_lifecycle_event_states_and_error_info(

python/ray/tests/test_task_events.py

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,7 @@ def sleep():
191191
verify_failed_task,
192192
name="node-killed",
193193
error_type="NODE_DIED",
194-
error_message="Task failed due to the node (where this task was running) "
195-
" was dead or unavailable",
194+
error_message="Task failed because the node it was running on is dead or unavailable",
196195
)
197196

198197

@@ -226,40 +225,6 @@ def task():
226225
)
227226

228227

229-
# TODO(rickyx): Make this work.
230-
# def test_failed_task_removed_placement_group(shutdown_only, monkeypatch):
231-
# ray.init(num_cpus=2, _system_config=_SYSTEM_CONFIG)
232-
# from ray.util.placement_group import placement_group, remove_placement_group
233-
# from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
234-
#
235-
# pg = placement_group([{"CPU": 2}])
236-
# ray.get(pg.ready())
237-
#
238-
# @ray.remote(num_cpus=2)
239-
# def sleep():
240-
# time.sleep(999)
241-
#
242-
# with monkeypatch.context() as m:
243-
# m.setenv(
244-
# "RAY_testing_asio_delay_us",
245-
# "NodeManagerService.grpc_server.RequestWorkerLease=3000000:3000000",
246-
# )
247-
#
248-
# sleep.options(
249-
# scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg),
250-
# name="task-pg-removed",
251-
# max_retries=0,
252-
# ).remote()
253-
#
254-
# remove_placement_group(pg)
255-
#
256-
# wait_for_condition(
257-
# verify_failed_task,
258-
# name="task-pg-removed",
259-
# error_type="TASK_PLACEMENT_GROUP_REMOVED",
260-
# )
261-
262-
263228
def test_failed_task_runtime_env_setup(shutdown_only):
264229
import conda
265230

src/ray/core_worker/task_submission/normal_task_submitter.cc

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,7 @@ bool NormalTaskSubmitter::HandleGetWorkerFailureCause(
613613
rpc::ErrorType task_error_type = rpc::ErrorType::WORKER_DIED;
614614
std::unique_ptr<rpc::RayErrorInfo> error_info;
615615
bool fail_immediately = false;
616+
NodeID node_id = NodeID::FromBinary(addr.node_id());
616617
if (get_worker_failure_cause_reply_status.ok()) {
617618
RAY_LOG(WARNING) << "Worker failure cause for task " << task_id << ": "
618619
<< ray::gcs::RayErrorInfoToString(
@@ -630,22 +631,21 @@ bool NormalTaskSubmitter::HandleGetWorkerFailureCause(
630631
RAY_LOG(WARNING) << "Failed to fetch worker failure cause with status "
631632
<< get_worker_failure_cause_reply_status.ToString()
632633
<< " worker id: " << WorkerID::FromBinary(addr.worker_id())
633-
<< " node id: " << NodeID::FromBinary(addr.node_id())
634-
<< " ip: " << addr.ip_address();
634+
<< " node id: " << node_id << " ip: " << addr.ip_address();
635635
task_error_type = rpc::ErrorType::NODE_DIED;
636-
std::stringstream buffer;
637-
buffer << "Task failed due to the node (where this task was running) "
638-
<< " was dead or unavailable.\n\nThe node IP: " << addr.ip_address()
639-
<< ", node ID: " << NodeID::FromBinary(addr.node_id()) << "\n\n"
640-
<< "This can happen if the instance where the node was running failed, "
641-
<< "the node was preempted, or raylet crashed unexpectedly "
642-
<< "(e.g., due to OOM) etc.\n\n"
643-
<< "To see node death information, use `ray list nodes --filter \"node_id="
644-
<< NodeID::FromBinary(addr.node_id()) << "\"`, "
645-
<< "or check Ray dashboard cluster page, or search the node ID in GCS log, "
646-
<< "or use `ray logs raylet.out -ip " << addr.ip_address() << "`";
636+
637+
std::string error_message = absl::StrFormat(
638+
"Task failed because the node it was running on is dead or unavailable. Node IP: "
639+
"%s, node ID: %s. This can happen if the node was preempted, had a hardware "
640+
"failure, or its raylet crashed unexpectedly. To see node death information, use "
641+
"`ray list nodes --filter node_id=%s`, check the Ray dashboard cluster page, "
642+
"search the node ID in the GCS logs, or use `ray logs raylet.out -ip %s`.",
643+
addr.ip_address(),
644+
node_id.Hex(),
645+
node_id.Hex(),
646+
addr.ip_address());
647647
error_info = std::make_unique<rpc::RayErrorInfo>();
648-
error_info->set_error_message(buffer.str());
648+
error_info->set_error_message(error_message);
649649
error_info->set_error_type(rpc::ErrorType::NODE_DIED);
650650
}
651651
return task_manager_.FailOrRetryPendingTask(task_id,

0 commit comments

Comments
 (0)