Skip to content

Commit 3838afc

Browse files
committed
Revert "[core] (ray-get 2/n) Making ray.get thread-safe (#57911)"
This reverts commit 3af4650. Revert "[core] (1/n ray.get) Removing NotifyWorkerUnblocked from PlasmaStoreProvider::Get (#57691)" This reverts commit 6539477. Signed-off-by: irabbani <israbbani@gmail.com>
1 parent b6b1fac commit 3838afc

18 files changed

+333
-445
lines changed

python/ray/tests/BUILD.bazel

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,6 @@ py_test_module_list(
884884
"test_dataclient_disconnect.py",
885885
"test_iter.py",
886886
"test_placement_group.py",
887-
"test_ray_get.py",
888887
"test_state_api_2.py",
889888
"test_task_events.py",
890889
"test_unavailable_actors.py",

python/ray/tests/test_draining.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -427,21 +427,14 @@ def ping(self):
427427

428428
# Simulate autoscaler terminates the worker node after the draining deadline.
429429
cluster.remove_node(node2, graceful)
430-
431-
def check_actor_died_error():
432-
try:
433-
ray.get(actor.ping.remote())
434-
return False
435-
except ray.exceptions.ActorDiedError as e:
436-
assert e.preempted
437-
if graceful:
438-
assert "The actor died because its node has died." in str(e)
439-
assert "the actor's node was preempted: " + drain_reason_message in str(
440-
e
441-
)
442-
return True
443-
444-
wait_for_condition(check_actor_died_error)
430+
try:
431+
ray.get(actor.ping.remote())
432+
raise
433+
except ray.exceptions.ActorDiedError as e:
434+
assert e.preempted
435+
if graceful:
436+
assert "The actor died because its node has died." in str(e)
437+
assert "the actor's node was preempted: " + drain_reason_message in str(e)
445438

446439

447440
def test_drain_node_actor_restart(ray_start_cluster):

python/ray/tests/test_ray_get.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

src/ray/core_worker/core_worker.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,8 +1391,11 @@ Status CoreWorker::GetObjects(const std::vector<ObjectID> &ids,
13911391
timeout_ms - (current_time_ms() - start_time));
13921392
}
13931393
RAY_LOG(DEBUG) << "Plasma GET timeout " << local_timeout_ms;
1394-
RAY_RETURN_NOT_OK(
1395-
plasma_store_provider_->Get(plasma_object_ids, local_timeout_ms, &result_map));
1394+
RAY_RETURN_NOT_OK(plasma_store_provider_->Get(plasma_object_ids,
1395+
local_timeout_ms,
1396+
*worker_context_,
1397+
&result_map,
1398+
&got_exception));
13961399
}
13971400

13981401
// Loop through `ids` and fill each entry for the `results` vector,
@@ -3065,13 +3068,15 @@ bool CoreWorker::PinExistingReturnObject(const ObjectID &return_id,
30653068
// might not have the same value as the new copy. It would be better to evict
30663069
// the existing copy here.
30673070
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> result_map;
3071+
bool got_exception = false;
30683072

30693073
// Temporarily set the return object's owner's address. This is needed to retrieve the
30703074
// value from plasma.
30713075
reference_counter_->AddLocalReference(return_id, "<temporary (pin return object)>");
30723076
reference_counter_->AddBorrowedObject(return_id, ObjectID::Nil(), owner_address);
30733077

3074-
Status status = plasma_store_provider_->Get({return_id}, 0, &result_map);
3078+
auto status = plasma_store_provider_->Get(
3079+
{return_id}, 0, *worker_context_, &result_map, &got_exception);
30753080
// Remove the temporary ref.
30763081
RemoveLocalReference(return_id);
30773082

@@ -3338,7 +3343,8 @@ Status CoreWorker::GetAndPinArgsForExecutor(const TaskSpecification &task,
33383343
RAY_RETURN_NOT_OK(memory_store_->Get(
33393344
by_ref_ids, -1, *worker_context_, &result_map, &got_exception));
33403345
} else {
3341-
RAY_RETURN_NOT_OK(plasma_store_provider_->Get(by_ref_ids, -1, &result_map));
3346+
RAY_RETURN_NOT_OK(plasma_store_provider_->Get(
3347+
by_ref_ids, -1, *worker_context_, &result_map, &got_exception));
33423348
}
33433349
for (const auto &it : result_map) {
33443350
for (size_t idx : by_ref_indices[it.first]) {

src/ray/core_worker/store_provider/plasma_store_provider.cc

Lines changed: 57 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
#include "ray/core_worker/store_provider/plasma_store_provider.h"
1616

1717
#include <algorithm>
18-
#include <cstdint>
1918
#include <memory>
2019
#include <string>
2120
#include <utility>
@@ -178,20 +177,23 @@ Status CoreWorkerPlasmaStoreProvider::Release(const ObjectID &object_id) {
178177
return store_client_->Release(object_id);
179178
}
180179

181-
Status CoreWorkerPlasmaStoreProvider::GetObjectsFromPlasmaStore(
180+
Status CoreWorkerPlasmaStoreProvider::PullObjectsAndGetFromPlasmaStore(
182181
absl::flat_hash_set<ObjectID> &remaining,
183-
const std::vector<ObjectID> &ids,
182+
const std::vector<ObjectID> &batch_ids,
184183
int64_t timeout_ms,
185184
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
186185
bool *got_exception) {
186+
const auto owner_addresses = reference_counter_.GetOwnerAddresses(batch_ids);
187+
RAY_RETURN_NOT_OK(raylet_ipc_client_->AsyncGetObjects(batch_ids, owner_addresses));
188+
187189
std::vector<plasma::ObjectBuffer> plasma_results;
188-
RAY_RETURN_NOT_OK(store_client_->Get(ids, timeout_ms, &plasma_results));
190+
RAY_RETURN_NOT_OK(store_client_->Get(batch_ids, timeout_ms, &plasma_results));
189191

190192
// Add successfully retrieved objects to the result map and remove them from
191193
// the set of IDs to get.
192194
for (size_t i = 0; i < plasma_results.size(); i++) {
193195
if (plasma_results[i].data != nullptr || plasma_results[i].metadata != nullptr) {
194-
const auto &object_id = ids[i];
196+
const auto &object_id = batch_ids[i];
195197
std::shared_ptr<TrackedBuffer> data = nullptr;
196198
std::shared_ptr<Buffer> metadata = nullptr;
197199
if (plasma_results[i].data && plasma_results[i].data->Size() > 0) {
@@ -214,6 +216,7 @@ Status CoreWorkerPlasmaStoreProvider::GetObjectsFromPlasmaStore(
214216
(*results)[object_id] = std::move(result_object);
215217
}
216218
}
219+
217220
return Status::OK();
218221
}
219222

@@ -251,52 +254,57 @@ Status CoreWorkerPlasmaStoreProvider::GetExperimentalMutableObject(
251254
return store_client_->GetExperimentalMutableObject(object_id, mutable_object);
252255
}
253256

257+
Status UnblockIfNeeded(
258+
const std::shared_ptr<ipc::RayletIpcClientInterface> &raylet_client,
259+
const WorkerContext &ctx) {
260+
if (ctx.CurrentTaskIsDirectCall()) {
261+
// NOTE: for direct call actors, we still need to issue an unblock IPC to release
262+
// get subscriptions, even if the worker isn't blocked.
263+
if (ctx.ShouldReleaseResourcesOnBlockingCalls() || ctx.CurrentActorIsDirectCall()) {
264+
return raylet_client->NotifyWorkerUnblocked();
265+
} else {
266+
return Status::OK(); // We don't need to release resources.
267+
}
268+
} else {
269+
return raylet_client->CancelGetRequest();
270+
}
271+
}
272+
254273
Status CoreWorkerPlasmaStoreProvider::Get(
255274
const absl::flat_hash_set<ObjectID> &object_ids,
256275
int64_t timeout_ms,
257-
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results) {
258-
std::vector<ipc::ScopedResponse> get_request_cleanup_handlers;
259-
260-
bool got_exception = false;
261-
absl::flat_hash_set<ObjectID> remaining(object_ids.begin(), object_ids.end());
262-
std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
276+
const WorkerContext &ctx,
277+
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
278+
bool *got_exception) {
263279
std::vector<ObjectID> batch_ids;
280+
absl::flat_hash_set<ObjectID> remaining(object_ids.begin(), object_ids.end());
264281

265-
int64_t num_total_objects = static_cast<int64_t>(object_ids.size());
266-
267-
// TODO(57923): Need to understand if batching is necessary. If it's necessary,
268-
// then the reason needs to be documented.
269-
for (int64_t start = 0; start < num_total_objects; start += fetch_batch_size_) {
282+
// Send initial requests to pull all objects in parallel.
283+
std::vector<ObjectID> id_vector(object_ids.begin(), object_ids.end());
284+
int64_t total_size = static_cast<int64_t>(object_ids.size());
285+
for (int64_t start = 0; start < total_size; start += fetch_batch_size_) {
270286
batch_ids.clear();
271-
for (int64_t i = start; i < start + fetch_batch_size_ && i < num_total_objects; i++) {
287+
for (int64_t i = start; i < start + fetch_batch_size_ && i < total_size; i++) {
272288
batch_ids.push_back(id_vector[i]);
273289
}
274-
275-
// 1. Make the request to pull all objects into local plasma if not local already.
276-
std::vector<rpc::Address> owner_addresses =
277-
reference_counter_.GetOwnerAddresses(batch_ids);
278-
StatusOr<ipc::ScopedResponse> status_or_cleanup =
279-
raylet_ipc_client_->AsyncGetObjects(batch_ids, owner_addresses);
280-
RAY_RETURN_NOT_OK(status_or_cleanup.status());
281-
get_request_cleanup_handlers.emplace_back(std::move(status_or_cleanup.value()));
282-
283-
// 2. Try to Get all objects that are already local from the plasma store.
284290
RAY_RETURN_NOT_OK(
285-
GetObjectsFromPlasmaStore(remaining,
286-
batch_ids,
287-
/*timeout_ms=*/0,
288-
// Mutable objects must be local before ray.get.
289-
results,
290-
&got_exception));
291+
PullObjectsAndGetFromPlasmaStore(remaining,
292+
batch_ids,
293+
/*timeout_ms=*/0,
294+
// Mutable objects must be local before ray.get.
295+
results,
296+
got_exception));
291297
}
292298

293-
if (remaining.empty() || got_exception) {
294-
return Status::OK();
299+
// If all objects were fetched already, return. Note that we always need to
300+
// call UnblockIfNeeded() to cancel the get request.
301+
if (remaining.empty() || *got_exception) {
302+
return UnblockIfNeeded(raylet_ipc_client_, ctx);
295303
}
296304

297-
// 3. If not all objects were successfully fetched, repeatedly call
298-
// GetObjectsFromPlasmaStore in batches. This loop will run indefinitely until the
299-
// objects are all fetched if timeout is -1.
305+
// If not all objects were successfully fetched, repeatedly call FetchOrReconstruct
306+
// and Get from the local object store in batches. This loop will run indefinitely
307+
// until the objects are all fetched if timeout is -1.
300308
bool should_break = false;
301309
bool timed_out = false;
302310
int64_t remaining_timeout = timeout_ms;
@@ -320,16 +328,18 @@ Status CoreWorkerPlasmaStoreProvider::Get(
320328
}
321329

322330
size_t previous_size = remaining.size();
323-
RAY_RETURN_NOT_OK(GetObjectsFromPlasmaStore(
324-
remaining, batch_ids, batch_timeout, results, &got_exception));
325-
should_break = timed_out || got_exception;
331+
RAY_RETURN_NOT_OK(PullObjectsAndGetFromPlasmaStore(
332+
remaining, batch_ids, batch_timeout, results, got_exception));
333+
should_break = timed_out || *got_exception;
326334

327335
if ((previous_size - remaining.size()) < batch_ids.size()) {
328336
WarnIfFetchHanging(fetch_start_time_ms, remaining);
329337
}
330338
if (check_signals_) {
331339
Status status = check_signals_();
332340
if (!status.ok()) {
341+
// TODO(edoakes): in this case which status should we return?
342+
RAY_RETURN_NOT_OK(UnblockIfNeeded(raylet_ipc_client_, ctx));
333343
return status;
334344
}
335345
}
@@ -344,14 +354,13 @@ Status CoreWorkerPlasmaStoreProvider::Get(
344354
}
345355

346356
if (!remaining.empty() && timed_out) {
347-
return Status::TimedOut(absl::StrFormat(
348-
"Could not fetch %d objects within the timeout of %dms. %d objects were not "
349-
"ready.",
350-
object_ids.size(),
351-
timeout_ms,
352-
remaining.size()));
357+
RAY_RETURN_NOT_OK(UnblockIfNeeded(raylet_ipc_client_, ctx));
358+
return Status::TimedOut("Get timed out: some object(s) not ready.");
353359
}
354-
return Status::OK();
360+
361+
// Notify unblocked because we blocked when calling FetchOrReconstruct with
362+
// fetch_only=false.
363+
return UnblockIfNeeded(raylet_ipc_client_, ctx);
355364
}
356365

357366
Status CoreWorkerPlasmaStoreProvider::Contains(const ObjectID &object_id,

src/ray/core_worker/store_provider/plasma_store_provider.h

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -154,24 +154,11 @@ class CoreWorkerPlasmaStoreProvider {
154154
/// argument to Get to retrieve the object data.
155155
Status Release(const ObjectID &object_id);
156156

157-
/// Fetches data from the local plasma store. If an object is not available in the
158-
/// local plasma store, then the raylet will trigger a pull request to copy an object
159-
/// into the local plasma store from another node.
160-
///
161-
/// \param[in] object_ids objects to fetch if they are not already in local plasma.
162-
/// \param[in] timeout_ms if the timeout elapses, the request will be canceled.
163-
/// \param[out] results objects fetched from plasma. This is only valid if the function
164-
///
165-
/// \return Status::IOError if there's an error communicating with the raylet.
166-
/// \return Status::TimedOut if timeout_ms was reached before all object_ids could be
167-
/// fetched.
168-
/// \return Status::Interrupted if a SIGINT signal was received.
169-
/// \return Status::IntentionalSystemExit if a SIGTERM signal was was received.
170-
/// \return Status::UnexpectedSystemExit if any other signal was received.
171-
/// \return Status::OK otherwise.
172157
Status Get(const absl::flat_hash_set<ObjectID> &object_ids,
173158
int64_t timeout_ms,
174-
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results);
159+
const WorkerContext &ctx,
160+
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
161+
bool *got_exception);
175162

176163
/// Get objects directly from the local plasma store, without waiting for the
177164
/// objects to be fetched from another node. This should only be used
@@ -218,24 +205,22 @@ class CoreWorkerPlasmaStoreProvider {
218205
std::shared_ptr<plasma::PlasmaClientInterface> &store_client() { return store_client_; }
219206

220207
private:
221-
/// Ask the plasma store to return object objects within the timeout.
222-
/// Successfully fetched objects will be removed from the input set of remaining IDs and
223-
/// added to the results map.
208+
/// Ask the raylet to pull a set of objects and then attempt to get them
209+
/// from the local plasma store. Successfully fetched objects will be removed
210+
/// from the input set of remaining IDs and added to the results map.
224211
///
225212
/// \param[in/out] remaining IDs of the remaining objects to get.
226-
/// \param[in] ids IDs of the objects to get.
213+
/// \param[in] batch_ids IDs of the objects to get.
227214
/// \param[in] timeout_ms Timeout in milliseconds.
228215
/// \param[out] results Map of objects to write results into. This method will only
229216
/// add to this map, not clear or remove from it, so the caller can pass in a non-empty
230217
/// map.
231218
/// \param[out] got_exception Set to true if any of the fetched objects contained an
232219
/// exception.
233-
/// \return Status::IOError if there is an error in communicating with the raylet or the
234-
/// plasma store.
235-
/// \return Status::OK if successful.
236-
Status GetObjectsFromPlasmaStore(
220+
/// \return Status.
221+
Status PullObjectsAndGetFromPlasmaStore(
237222
absl::flat_hash_set<ObjectID> &remaining,
238-
const std::vector<ObjectID> &ids,
223+
const std::vector<ObjectID> &batch_ids,
239224
int64_t timeout_ms,
240225
absl::flat_hash_map<ObjectID, std::shared_ptr<RayObject>> *results,
241226
bool *got_exception);

0 commit comments

Comments
 (0)