ray-project
diff --git a/python/ray/tests/BUILD.bazel b/python/ray/tests/BUILD.bazel
Lines changed: 0 additions & 1 deletion
diff --git a/python/ray/tests/test_draining.py b/python/ray/tests/test_draining.py
Lines changed: 8 additions & 15 deletions
diff --git a/python/ray/tests/test_ray_get.py b/python/ray/tests/test_ray_get.py
Lines changed: 0 additions & 76 deletions
diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc
Lines changed: 10 additions & 4 deletions
diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc
Lines changed: 57 additions & 48 deletions
diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.h b/src/ray/core_worker/store_provider/plasma_store_provider.h
Lines changed: 10 additions & 25 deletions
@@ -884,7 +884,6 @@ py_test_module_list(
         "test_dataclient_disconnect.py",
         "test_iter.py",
         "test_placement_group.py",
-        "test_ray_get.py",
         "test_state_api_2.py",
         "test_task_events.py",
         "test_unavailable_actors.py",
 
@@ -427,21 +427,14 @@ def ping(self):
 
     # Simulate autoscaler terminates the worker node after the draining deadline.
     cluster.remove_node(node2, graceful)
-
-    def check_actor_died_error():
-        try:
-            ray.get(actor.ping.remote())
-            return False
-        except ray.exceptions.ActorDiedError as e:
-            assert e.preempted
-            if graceful:
-                assert "The actor died because its node has died." in str(e)
-                assert "the actor's node was preempted: " + drain_reason_message in str(
-                    e
-                )
-        return True
-
-    wait_for_condition(check_actor_died_error)
+    try:
+        ray.get(actor.ping.remote())
+        raise
+    except ray.exceptions.ActorDiedError as e:
+        assert e.preempted
+        if graceful:
+            assert "The actor died because its node has died." in str(e)
+            assert "the actor's node was preempted: " + drain_reason_message in str(e)
 
 
 def test_drain_node_actor_restart(ray_start_cluster):
 
@@ -1391,8 +1391,11 @@ Status CoreWorker::GetObjects(const std::vector<ObjectID> &ids,
                                   timeout_ms - (current_time_ms() - start_time));
     }
     RAY_LOG(DEBUG) << "Plasma GET timeout " << local_timeout_ms;
-    RAY_RETURN_NOT_OK(
-        plasma_store_provider_->Get(plasma_object_ids, local_timeout_ms, &result_map));
+    RAY_RETURN_NOT_OK(plasma_store_provider_->Get(plasma_object_ids,
+                                                  local_timeout_ms,
+                                                  *worker_context_,
+                                                  &result_map,
+                                                  &got_exception));
   }
 
   // Loop through `ids` and fill each entry for the `results` vector,
@@ -3065,13 +3068,15 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id,
   // might not have the same value as the new copy. It would be better to evict
   // the existing copy here.
   absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> result_map;
+  bool got_exception = false;
 
   // Temporarily set the return object's owner's address. This is needed to retrieve the
   // value from plasma.
   reference_counter_->AddLocalReference(return_id, "<temporary (pin return object)>");
   reference_counter_->AddBorrowedObject(return_id, ObjectID::Nil(), owner_address);
 
-  Status status = plasma_store_provider_->Get({return_id}, 0, &result_map);
+  auto status = plasma_store_provider_->Get(
+      {return_id}, 0, *worker_context_, &result_map, &got_exception);
   // Remove the temporary ref.
   RemoveLocalReference(return_id);
 
@@ -3338,7 +3343,8 @@ Status CoreWorker::GetAndPinArgsForExecutor(const TaskSpecification &task,
     RAY_RETURN_NOT_OK(memory_store_->Get(
         by_ref_ids, -1, *worker_context_, &result_map, &got_exception));
   } else {
-    RAY_RETURN_NOT_OK(plasma_store_provider_->Get(by_ref_ids, -1, &result_map));
+    RAY_RETURN_NOT_OK(plasma_store_provider_->Get(
+        by_ref_ids, -1, *worker_context_, &result_map, &got_exception));
   }
   for (const auto &it : result_map) {
     for (size_t idx : by_ref_indices[it.first]) {
 
@@ -15,7 +15,6 @@
 #include "ray/core_worker/store_provider/plasma_store_provider.h"
 
 #include <algorithm>
-#include <cstdint>
 #include <memory>
 #include <string>
 #include <utility>
@@ -178,20 +177,23 @@ Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) {
   return store_client_->Release(object_id);
 }
 
-Status CoreWorkerPlasmaStoreProvider::GetObjectsFromPlasmaStore(
+Status CoreWorkerPlasmaStoreProvider::PullObjectsAndGetFromPlasmaStore(
     absl::flat_hash_set<ObjectID> &remaining,
-    const std::vector<ObjectID> &ids,
+    const std::vector<ObjectID> &batch_ids,
     int64_t timeout_ms,
     absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
     bool *got_exception) {
+  const auto owner_addresses = reference_counter_.GetOwnerAddresses(batch_ids);
+  RAY_RETURN_NOT_OK(raylet_ipc_client_->AsyncGetObjects(batch_ids, owner_addresses));
+
   std::vector<plasma::ObjectBuffer> plasma_results;
-  RAY_RETURN_NOT_OK(store_client_->Get(ids, timeout_ms, &plasma_results));
+  RAY_RETURN_NOT_OK(store_client_->Get(batch_ids, timeout_ms, &plasma_results));
 
   // Add successfully retrieved objects to the result map and remove them from
   // the set of IDs to get.
   for (size_t i = 0; i < plasma_results.size(); i++) {
     if (plasma_results[i].data != nullptr || plasma_results[i].metadata != nullptr) {
-      const auto &object_id = ids[i];
+      const auto &object_id = batch_ids[i];
       std::shared_ptr<TrackedBuffer> data = nullptr;
       std::shared_ptr<Buffer> metadata = nullptr;
       if (plasma_results[i].data && plasma_results[i].data->Size() > 0) {
@@ -214,6 +216,7 @@ Status CoreWorkerPlasmaStoreProvider::GetObjectsFromPlasmaStore(
       (*results)[object_id] = std::move(result_object);
     }
   }
+
   return Status::OK();
 }
 
@@ -251,52 +254,57 @@ Status CoreWorkerPlasmaStoreProvider::GetExperimentalMutableObject(
   return store_client_->GetExperimentalMutableObject(object_id, mutable_object);
 }
 
+Status UnblockIfNeeded(  // Sends the raylet the IPC (if any) that ends a get, per ctx.
+    const std::shared_ptr<ipc::RayletIpcClientInterface> &raylet_client,
+    const WorkerContext &ctx) {
+  if (ctx.CurrentTaskIsDirectCall()) {
+    // NOTE: direct call actors must still send the unblock IPC so that their get
+    // subscriptions are released, even if the worker isn't blocked.
+    if (ctx.ShouldReleaseResourcesOnBlockingCalls() || ctx.CurrentActorIsDirectCall()) {
+      return raylet_client->NotifyWorkerUnblocked();
+    } else {
+      return Status::OK();  // Neither condition holds: no IPC is sent.
+    }
+  } else {
+    return raylet_client->CancelGetRequest();  // Not a direct call task: cancel the get.
+  }
+}
+
 Status CoreWorkerPlasmaStoreProvider::Get(
     const absl::flat_hash_set<ObjectID> &object_ids,
     int64_t timeout_ms,
-    absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results) {
-  std::vector<ipc::ScopedResponse> get_request_cleanup_handlers;
-
-  bool got_exception = false;
-  absl::flat_hash_set<ObjectID> remaining(object_ids.begin(), object_ids.end());
-  std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
+    const WorkerContext &ctx,
+    absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
+    bool *got_exception) {
   std::vector<ObjectID> batch_ids;
+  absl::flat_hash_set<ObjectID> remaining(object_ids.begin(), object_ids.end());
 
-  int64_t num_total_objects = static_cast<int64_t>(object_ids.size());
-
-  // TODO(57923): Need to understand if batching is necessary. If it's necessary,
-  // then the reason needs to be documented.
-  for (int64_t start = 0; start < num_total_objects; start += fetch_batch_size_) {
+  // Send initial requests to pull all objects in parallel.
+  std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
+  int64_t total_size = static_cast<int64_t>(object_ids.size());
+  for (int64_t start = 0; start < total_size; start += fetch_batch_size_) {
     batch_ids.clear();
-    for (int64_t i = start; i < start + fetch_batch_size_ && i < num_total_objects; i++) {
+    for (int64_t i = start; i < start + fetch_batch_size_ && i < total_size; i++) {
       batch_ids.push_back(id_vector[i]);
     }
-
-    // 1. Make the request to pull all objects into local plasma if not local already.
-    std::vector<rpc::Address> owner_addresses =
-        reference_counter_.GetOwnerAddresses(batch_ids);
-    StatusOr<ipc::ScopedResponse> status_or_cleanup =
-        raylet_ipc_client_->AsyncGetObjects(batch_ids, owner_addresses);
-    RAY_RETURN_NOT_OK(status_or_cleanup.status());
-    get_request_cleanup_handlers.emplace_back(std::move(status_or_cleanup.value()));
-
-    // 2. Try to Get all objects that are already local from the plasma store.
     RAY_RETURN_NOT_OK(
-        GetObjectsFromPlasmaStore(remaining,
-                                  batch_ids,
-                                  /*timeout_ms=*/0,
-                                  // Mutable objects must be local before ray.get.
-                                  results,
-                                  &got_exception));
+        PullObjectsAndGetFromPlasmaStore(remaining,
+                                         batch_ids,
+                                         /*timeout_ms=*/0,
+                                         // Mutable objects must be local before ray.get.
+                                         results,
+                                         got_exception));
   }
 
-  if (remaining.empty() || got_exception) {
-    return Status::OK();
+  // If all objects were fetched already, return. Note that we always need to
+  // call UnblockIfNeeded() to cancel the get request.
+  if (remaining.empty() || *got_exception) {
+    return UnblockIfNeeded(raylet_ipc_client_, ctx);
   }
 
-  // 3. If not all objects were successfully fetched, repeatedly call
-  // GetObjectsFromPlasmaStore in batches. This loop will run indefinitely until the
-  // objects are all fetched if timeout is -1.
+  // If not all objects were successfully fetched, repeatedly call
+  // PullObjectsAndGetFromPlasmaStore in batches. This loop will run indefinitely
+  // until the objects are all fetched if timeout is -1.
   bool should_break = false;
   bool timed_out = false;
   int64_t remaining_timeout = timeout_ms;
@@ -320,16 +328,18 @@ Status CoreWorkerPlasmaStoreProvider::Get(
     }
 
     size_t previous_size = remaining.size();
-    RAY_RETURN_NOT_OK(GetObjectsFromPlasmaStore(
-        remaining, batch_ids, batch_timeout, results, &got_exception));
-    should_break = timed_out || got_exception;
+    RAY_RETURN_NOT_OK(PullObjectsAndGetFromPlasmaStore(
+        remaining, batch_ids, batch_timeout, results, got_exception));
+    should_break = timed_out || *got_exception;
 
     if ((previous_size - remaining.size()) < batch_ids.size()) {
       WarnIfFetchHanging(fetch_start_time_ms, remaining);
     }
     if (check_signals_) {
       Status status = check_signals_();
       if (!status.ok()) {
+        // TODO(edoakes): in this case which status should we return?
+        RAY_RETURN_NOT_OK(UnblockIfNeeded(raylet_ipc_client_, ctx));
         return status;
       }
     }
@@ -344,14 +354,13 @@ Status CoreWorkerPlasmaStoreProvider::Get(
   }
 
   if (!remaining.empty() && timed_out) {
-    return Status::TimedOut(absl::StrFormat(
-        "Could not fetch %d objects within the timeout of %dms. %d objects were not "
-        "ready.",
-        object_ids.size(),
-        timeout_ms,
-        remaining.size()));
+    RAY_RETURN_NOT_OK(UnblockIfNeeded(raylet_ipc_client_, ctx));
+    return Status::TimedOut("Get timed out: some object(s) not ready.");
   }
-  return Status::OK();
+
+  // Notify unblocked to pair with the AsyncGetObjects requests issued by
+  // PullObjectsAndGetFromPlasmaStore.
+  return UnblockIfNeeded(raylet_ipc_client_, ctx);
 }
 
 Status CoreWorkerPlasmaStoreProvider::Contains(const ObjectID &object_id,
 
@@ -154,24 +154,11 @@ class CoreWorkerPlasmaStoreProvider {
   /// argument to Get to retrieve the object data.
   Status Release(const ObjectID &object_id);
 
-  /// Fetches data from the local plasma store. If an object is not available in the
-  /// local plasma store, then the raylet will trigger a pull request to copy an object
-  /// into the local plasma store from another node.
-  ///
-  /// \param[in] object_ids objects to fetch if they are not already in local plasma.
-  /// \param[in] timeout_ms if the timeout elapses, the request will be canceled.
-  /// \param[out] results objects fetched from plasma. This is only valid if the function
-  ///
-  /// \return Status::IOError if there's an error communicating with the raylet.
-  /// \return Status::TimedOut if timeout_ms was reached before all object_ids could be
-  /// fetched.
-  /// \return Status::Interrupted if a SIGINT signal was received.
-  /// \return Status::IntentionalSystemExit if a SIGTERM signal was was received.
-  /// \return Status::UnexpectedSystemExit if any other signal was received.
-  /// \return Status::OK otherwise.
   Status Get(const absl::flat_hash_set<ObjectID> &object_ids,
              int64_t timeout_ms,
-             absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results);
+             const WorkerContext &ctx,
+             absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
+             bool *got_exception);
 
   /// Get objects directly from the local plasma store, without waiting for the
   /// objects to be fetched from another node. This should only be used
@@ -218,24 +205,22 @@ class CoreWorkerPlasmaStoreProvider {
   std::shared_ptr<plasma::PlasmaClientInterface> &store_client() { return store_client_; }
 
  private:
-  /// Ask the plasma store to return object objects within the timeout.
-  /// Successfully fetched objects will be removed from the input set of remaining IDs and
-  /// added to the results map.
+  /// Ask the raylet to pull a set of objects and then attempt to get them
+  /// from the local plasma store. Successfully fetched objects will be removed
+  /// from the input set of remaining IDs and added to the results map.
   ///
   /// \param[in/out] remaining IDs of the remaining objects to get.
-  /// \param[in] ids IDs of the objects to get.
+  /// \param[in] batch_ids IDs of the objects to get.
   /// \param[in] timeout_ms Timeout in milliseconds.
   /// \param[out] results Map of objects to write results into. This method will only
   /// add to this map, not clear or remove from it, so the caller can pass in a non-empty
   /// map.
   /// \param[out] got_exception Set to true if any of the fetched objects contained an
   /// exception.
-  /// \return Status::IOError if there is an error in communicating with the raylet or the
-  /// plasma store.
-  /// \return Status::OK if successful.
-  Status GetObjectsFromPlasmaStore(
+  /// \return Status::OK on success; non-OK if the raylet request or plasma Get fails.
+  Status PullObjectsAndGetFromPlasmaStore(
       absl::flat_hash_set<ObjectID> &remaining,
-      const std::vector<ObjectID> &ids,
+      const std::vector<ObjectID> &batch_ids,
       int64_t timeout_ms,
       absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
       bool *got_exception);