From d539c3e312f17167d01d90fd2f3002ed8cc2af68 Mon Sep 17 00:00:00 2001 From: jackhumphries <1645405+jackhumphries@users.noreply.github.com> Date: Thu, 27 Jun 2024 07:28:28 -0700 Subject: [PATCH] [Core] Fix deadlock in accelerated DAG channel read (#46288) Signed-off-by: Jack Humphries <1645405+jackhumphries@users.noreply.github.com> --- .../experimental_mutable_object_manager.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ray/core_worker/experimental_mutable_object_manager.cc b/src/ray/core_worker/experimental_mutable_object_manager.cc index 4511b6802f95..d90f87327acb 100644 --- a/src/ray/core_worker/experimental_mutable_object_manager.cc +++ b/src/ray/core_worker/experimental_mutable_object_manager.cc @@ -285,14 +285,15 @@ Status MutableObjectManager::ReadAcquire(const ObjectID &object_id, "Channel has not been registered (cannot get semaphores)"); } + std::unique_ptr &object = channel->mutable_object; // Check whether the channel has an error set before checking that we are the only // reader. If the channel is already closed, then it's OK to ReadAcquire and // ReadRelease in any order. - std::unique_ptr &object = channel->mutable_object; - RAY_RETURN_NOT_OK(object->header->CheckHasError()); - // The channel is still open. This lock ensures that there is only one reader - // at a time. The lock is released in `ReadRelease()`. - channel->lock->lock(); + do { + RAY_RETURN_NOT_OK(object->header->CheckHasError()); + // The channel is still open. This lock ensures that there is only one reader + // at a time. The lock is released in `ReadRelease()`. + } while (!channel->lock->try_lock()); channel->reading = true; int64_t version_read = 0;