Closed
Labels
bug, community-backlog, core, stability, triage
Description
What happened + What you expected to happen
I am facing a weird issue: ray.get has started to hang for me, but only inside actors and only with KubeRay; I can't reproduce the issue locally.
I don't know what started causing this. I did not change Ray; I suspect it has something to do with system C libraries that got updated during an image rebuild (via apt-get update), but I am not sure.
I was able to get the following stack trace from my KubeRay cluster:
Process 425: ray::_MapWorker.__init__
Python v3.11.13 (/usr/local/bin/python3.11)
Thread 425 (idle): "MainThread"
recv (libc.so.6)
boost::asio::detail::socket_ops::recv1 (ray/_raylet.so)
boost::asio::detail::socket_ops::sync_recv1 (ray/_raylet.so)
ray::ServerConnection::ReadBuffer (ray/_raylet.so)
ray::ServerConnection::ReadMessage (ray/_raylet.so)
plasma::PlasmaReceive (ray/_raylet.so)
plasma::PlasmaClient::Impl::GetBuffers (ray/_raylet.so)
plasma::PlasmaClient::Impl::Get (ray/_raylet.so)
plasma::PlasmaClient::Get (ray/_raylet.so)
ray::core::CoreWorkerPlasmaStoreProvider::FetchAndGetFromPlasmaStore (ray/_raylet.so)
ray::core::CoreWorkerPlasmaStoreProvider::Get (ray/_raylet.so)
ray::core::CoreWorker::GetObjects (ray/_raylet.so)
ray::core::CoreWorker::Get (ray/_raylet.so)
get_objects (ray/_raylet.so)
get_objects (ray/_private/worker.py:878)
get (ray/_private/worker.py:2745)
wrapper (ray/_private/client_mode_hook.py:103)
auto_init_wrapper (ray/_private/auto_init_hook.py:21)
__init__ (anam_training_data/ray_processing/crop.py:455)
__init__ (ray/data/_internal/execution/util.py:70)
init_fn (ray/data/_internal/planner/plan_udf_map_op.py:153)
init (ray/data/_internal/execution/operators/map_transformer.py:126)
__init__ (ray/data/_internal/execution/operators/actor_pool_map_operator.py:368)
_resume_span (ray/util/tracing/tracing_helper.py:467)
actor_method_executor (ray/_private/function_manager.py:696)
function_executor (ray/_raylet.so)
_raylet_task_execution_handler (ray/_raylet.so)
std::_Function_handler<ray::Status(ray::rpc::Address const&, ray::rpc::TaskType, std::string, ray::core::RayFunction const&, std::unordered_map<std::string, double, std::hash<std::string>, std::equal_to<std::string>, std::allocator<std::pair<std::string const, double> > > const&, std::vector<std::shared_ptr<ray::RayObject>, std::allocator<std::shared_ptr<ray::RayObject> > > const&, std::vector<ray::rpc::ObjectReference, std::allocator<ray::rpc::ObjectReference> > const&, std::string const&, std::string const&, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, bool>, std::allocator<std::pair<ray::ObjectID, bool> > >*, std::shared_ptr<ray::LocalMemoryBuffer>&, bool*, std::string*, std::vector<ray::ConcurrencyGroup, std::allocator<ray::ConcurrencyGroup> > const&, std::string, bool, bool, bool, long), ray::Status (*)(ray::rpc::Address const&, ray::rpc::TaskType, std::string, ray::core::RayFunction const&, std::unordered_map<std::string, double, std::hash<std::string>, std::equal_to<std::string>, std::allocator<std::pair<std::string const, double> > > const&, std::vector<std::shared_ptr<ray::RayObject>, std::allocator<std::shared_ptr<ray::RayObject> > > const&, std::vector<ray::rpc::ObjectReference, std::allocator<ray::rpc::ObjectReference> > const&, std::string, std::string, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, bool>, std::allocator<std::pair<ray::ObjectID, bool> > >*, std::shared_ptr<ray::LocalMemoryBuffer>&, bool*, std::string*, std::vector<ray::ConcurrencyGroup, std::allocator<ray::ConcurrencyGroup> > const&, std::string, bool, bool, bool, long)>::_M_invoke (ray/_raylet.so)
ray::core::CoreWorker::ExecuteTask (ray/_raylet.so)
std::_Function_handler<ray::Status(ray::TaskSpecification const&, std::shared_ptr<std::unordered_map<std::string, std::vector<std::pair<long, double>, std::allocator<std::pair<long, double> > >, std::hash<std::string>, std::equal_to<std::string>, std::allocator<std::pair<std::string const, std::vector<std::pair<long, double>, std::allocator<std::pair<long, double> > > > > > >, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, bool>, std::allocator<std::pair<ray::ObjectID, bool> > >*, google::protobuf::RepeatedPtrField<ray::rpc::ObjectReferenceCount>*, bool*, std::string*), std::_Bind<ray::Status (ray::core::CoreWorker(ray::core::CoreWorker*, std::_Placeholder<1>, std::_Placeholder<2>, std::_Placeholder<3>, std::_Placeholder<4>, std::_Placeholder<5>, std::_Placeholder<6>, std::_Placeholder<7>, std::_Placeholder<8>)::*)(ray::TaskSpecification const&, std::shared_ptr<std::unordered_map<std::string, std::vector<std::pair<long, double>, std::allocator<std::pair<long, double> > >, std::hash<std::string>, std::equal_to<std::string>, std::allocator<std::pair<std::string const, std::vector<std::pair<long, double>, std::allocator<std::pair<long, double> > > > > > > const&, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> >, std::allocator<std::pair<ray::ObjectID, std::shared_ptr<ray::RayObject> > > >*, std::vector<std::pair<ray::ObjectID, bool>, std::allocator<std::pair<ray::ObjectID, bool> > >*, google::protobuf::RepeatedPtrField<ray::rpc::ObjectReferenceCount>*, bool*, std::string*)> >::_M_invoke (ray/_raylet.so)
ray::core::TaskReceiver::HandleTask(ray::rpc::PushTaskRequest const&, ray::rpc::PushTaskReply*, std::function<void (ray::Status, std::function<void ()>, std::function<void ()>)>)::{lambda(std::function<void (ray::Status, std::function<void ()>, std::function<void ()>)>)#1}::operator() const (ray/_raylet.so)
std::_Function_handler<void (std::function<void (ray::Status, std::function<void ()>, std::function<void ()>)>), ray::core::TaskReceiver::HandleTask(ray::rpc::PushTaskRequest const&, ray::rpc::PushTaskReply*, std::function<void (ray::Status, std::function<void ()>, std::function<void ()>)>)::{lambda(std::function<void (ray::Status, std::function<void ()>, std::function<void ()>)>)#1}>::_M_invoke (ray/_raylet.so)
ray::core::InboundRequest::Accept (ray/_raylet.so)
ray::core::NormalSchedulingQueue::ScheduleRequests (ray/_raylet.so)
EventTracker::RecordExecution (ray/_raylet.so)
std::_Function_handler<void (), instrumented_io_context::post(std::function<void ()>, std::string, long)::{lambda()#1}>::_M_invoke (ray/_raylet.so)
boost::asio::detail::completion_handler<std::function<void ()>, boost::asio::io_context::basic_executor_type<std::allocator<void>, (unsigned long)0> >::do_complete (ray/_raylet.so)
boost::asio::detail::scheduler::do_run_one (ray/_raylet.so)
boost::asio::detail::scheduler::run (ray/_raylet.so)
boost::asio::io_context::run (ray/_raylet.so)
ray::core::CoreWorker::RunTaskExecutionLoop (ray/_raylet.so)
ray::core::CoreWorkerProcessImpl::RunWorkerTaskExecutionLoop (ray/_raylet.so)
ray::core::CoreWorkerProcess::RunTaskExecutionLoop (ray/_raylet.so)
run_task_loop (ray/_raylet.so)
main_loop (ray/_private/worker.py:917)
<module> (ray/_private/workers/default_worker.py:289)
0x78d749ffc24a (libc.so.6)
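The trace shows the worker blocked in a synchronous recv on the plasma store socket while servicing a ray.get issued from the actor's constructor. The general pattern that hangs for me looks roughly like this (a minimal, hedged sketch; the Getter class and the array are placeholders, not my real code):

```python
import numpy as np
import ray

ray.init()


@ray.remote
class Getter:
    def fetch(self, refs):
        # refs is a list so Ray does not resolve the ObjectRef before the call;
        # on the KubeRay cluster this ray.get never returns, locally it works.
        return ray.get(refs[0]).shape


# ~32 MB array, large enough to be stored in the plasma object store.
big_ref = ray.put(np.zeros((2000, 2000)))

getter = Getter.remote()
print(ray.get(getter.fetch.remote([big_ref])))
```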
Versions / Dependencies
Tried ray 2.38.0 and ray 2.47.0.
Reproduction script
It's hard to reproduce this issue; it requires a distributed Ray setup.
I may be able to do it later with kind.
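In the meantime, here is a hedged sketch of the specific Ray Data code path the stack trace points at: map_batches with an actor pool whose constructor calls ray.get. CropWorker and weights_ref are hypothetical stand-ins for the code in crop.py, not a verified reproduction.

```python
import numpy as np
import ray

ray.init()

# Stand-in for the state my real __init__ fetches via ray.get.
weights_ref = ray.put(np.zeros((1000, 1000)))


class CropWorker:  # stand-in for the UDF class in crop.py
    def __init__(self, refs):
        # Corresponds to crop.py:455 in the trace: the constructor blocks here
        # on the cluster (FetchAndGetFromPlasmaStore -> recv never returns).
        self.weights = ray.get(refs[0])

    def __call__(self, batch):
        return batch


ds = ray.data.range(1000).map_batches(
    CropWorker,
    fn_constructor_args=([weights_ref],),
    concurrency=2,  # actor pool, matching ray::_MapWorker.__init__ in the trace
)
ds.materialize()
```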
Issue Severity
High: It blocks me from completing my task.