Skip to content

Commit fbf3c32

Browse files
authored
[core] Adding option for in flight rpc failure injection (#58512)
Signed-off-by: dayshah <dhyey2019@gmail.com>
1 parent 79d2a69 commit fbf3c32

14 files changed

+197
-102
lines changed

python/ray/_private/test_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2096,3 +2096,12 @@ def _execute_command_on_node(command: str, node_ip: str):
20962096
except subprocess.CalledProcessError as e:
20972097
print("Exit code:", e.returncode)
20982098
print("Stderr:", e.stderr)
2099+
2100+
2101+
RPC_FAILURE_MAP = {
2102+
"request": "100:0:0",
2103+
"response": "0:100:0",
2104+
"in_flight": "0:0:100",
2105+
}
2106+
2107+
RPC_FAILURE_TYPES = list(RPC_FAILURE_MAP.keys())

python/ray/dashboard/modules/job/tests/test_job_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ async def test_runtime_env_setup_logged_to_job_driver_logs(
361361
{
362362
"cmd": "ray start --head",
363363
"env": {
364-
"RAY_testing_rpc_failure": "ray::rpc::InternalKVGcsService.grpc_client.InternalKVGet=2:50:50,CoreWorkerService.grpc_client.PushTask=3:50:50"
364+
"RAY_testing_rpc_failure": "ray::rpc::InternalKVGcsService.grpc_client.InternalKVGet=3:33:33:33,CoreWorkerService.grpc_client.PushTask=3:33:33:33"
365365
},
366366
},
367367
],

python/ray/tests/test_actor_lineage_reconstruction.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
import pytest
77

88
import ray
9-
from ray._common.test_utils import wait_for_condition
9+
from ray._private.test_utils import (
10+
RPC_FAILURE_MAP,
11+
RPC_FAILURE_TYPES,
12+
wait_for_condition,
13+
)
1014
from ray.core.generated import common_pb2, gcs_pb2
1115

1216

13-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
17+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
1418
def test_actor_reconstruction_triggered_by_lineage_reconstruction(
1519
monkeypatch, ray_start_cluster, deterministic_failure
1620
):
@@ -21,11 +25,11 @@ def test_actor_reconstruction_triggered_by_lineage_reconstruction(
2125
# -> actor goes out of scope again after lineage reconstruction is done
2226
# -> actor is permanently dead when there is no reference.
2327
# This test also injects network failure to make sure relevant rpcs are retried.
24-
chaos_failure = "100:0" if deterministic_failure == "request" else "0:100"
28+
failure = RPC_FAILURE_MAP[deterministic_failure]
2529
monkeypatch.setenv(
2630
"RAY_testing_rpc_failure",
27-
f"ray::rpc::ActorInfoGcsService.grpc_client.RestartActorForLineageReconstruction=1:{chaos_failure},"
28-
f"ray::rpc::ActorInfoGcsService.grpc_client.ReportActorOutOfScope=1:{chaos_failure}",
31+
f"ray::rpc::ActorInfoGcsService.grpc_client.RestartActorForLineageReconstruction=1:{failure},"
32+
f"ray::rpc::ActorInfoGcsService.grpc_client.ReportActorOutOfScope=1:{failure}",
2933
)
3034
cluster = ray_start_cluster
3135
cluster.add_node(resources={"head": 1})

python/ray/tests/test_core_worker_fault_tolerance.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
import pytest
55

66
import ray
7-
from ray._common.test_utils import SignalActor, wait_for_condition
7+
from ray._common.test_utils import SignalActor
8+
from ray._private.test_utils import (
9+
RPC_FAILURE_MAP,
10+
RPC_FAILURE_TYPES,
11+
wait_for_condition,
12+
)
813
from ray.core.generated import common_pb2, gcs_pb2
914
from ray.exceptions import GetTimeoutError, TaskCancelledError
1015
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
@@ -14,18 +19,18 @@
1419
"allow_out_of_order_execution",
1520
[True, False],
1621
)
17-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
22+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
1823
def test_push_actor_task_failure(
1924
monkeypatch,
2025
ray_start_cluster,
2126
allow_out_of_order_execution: bool,
2227
deterministic_failure: str,
2328
):
2429
with monkeypatch.context() as m:
30+
failure = RPC_FAILURE_MAP[deterministic_failure]
2531
m.setenv(
2632
"RAY_testing_rpc_failure",
27-
"CoreWorkerService.grpc_client.PushTask=2:"
28-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
33+
f"CoreWorkerService.grpc_client.PushTask=2:{failure}",
2934
)
3035
m.setenv("RAY_actor_scheduling_queue_max_reorder_wait_seconds", "0")
3136
cluster = ray_start_cluster
@@ -47,15 +52,15 @@ def echo(self, value):
4752
assert ray.get(refs) == list(range(10))
4853

4954

50-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
55+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
5156
def test_update_object_location_batch_failure(
5257
monkeypatch, ray_start_cluster, deterministic_failure
5358
):
5459
with monkeypatch.context() as m:
60+
failure = RPC_FAILURE_MAP[deterministic_failure]
5561
m.setenv(
5662
"RAY_testing_rpc_failure",
57-
"CoreWorkerService.grpc_client.UpdateObjectLocationBatch=1:"
58-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
63+
f"CoreWorkerService.grpc_client.UpdateObjectLocationBatch=1:{failure}",
5964
)
6065
cluster = ray_start_cluster
6166
head_node_id = cluster.add_node(
@@ -85,7 +90,7 @@ def consume_large_object(obj):
8590
assert ray.get(consume_ref, timeout=10) > 0
8691

8792

88-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
93+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
8994
def test_get_object_status_rpc_retry_and_idempotency(
9095
monkeypatch, shutdown_only, deterministic_failure
9196
):
@@ -94,11 +99,10 @@ def test_get_object_status_rpc_retry_and_idempotency(
9499
Cross_worker_access_task triggers GetObjectStatus because it does
95100
not own objects and needs to request it from the driver.
96101
"""
97-
102+
failure = RPC_FAILURE_MAP[deterministic_failure]
98103
monkeypatch.setenv(
99104
"RAY_testing_rpc_failure",
100-
"CoreWorkerService.grpc_client.GetObjectStatus=1:"
101-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
105+
f"CoreWorkerService.grpc_client.GetObjectStatus=1:{failure}",
102106
)
103107

104108
ray.init()
@@ -118,7 +122,7 @@ def cross_worker_access_task(objects):
118122
assert final_result == [0, 2, 4, 6, 8]
119123

120124

121-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
125+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
122126
def test_wait_for_actor_ref_deleted_rpc_retry_and_idempotency(
123127
monkeypatch, shutdown_only, deterministic_failure
124128
):
@@ -127,11 +131,10 @@ def test_wait_for_actor_ref_deleted_rpc_retry_and_idempotency(
127131
The GCS actor manager will trigger this RPC during actor initialization
128132
to monitor when the actor handles have gone out of scope and the actor should be destroyed.
129133
"""
130-
134+
failure = RPC_FAILURE_MAP[deterministic_failure]
131135
monkeypatch.setenv(
132136
"RAY_testing_rpc_failure",
133-
"CoreWorkerService.grpc_client.WaitForActorRefDeleted=1:"
134-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
137+
f"CoreWorkerService.grpc_client.WaitForActorRefDeleted=1:{failure}",
135138
)
136139

137140
ray.init()
@@ -166,15 +169,17 @@ def verify_actor_ref_deleted():
166169
@pytest.fixture
167170
def inject_cancel_remote_task_rpc_failure(monkeypatch, request):
168171
deterministic_failure = request.param
172+
failure = RPC_FAILURE_MAP[deterministic_failure]
169173
monkeypatch.setenv(
170174
"RAY_testing_rpc_failure",
171-
"CoreWorkerService.grpc_client.CancelRemoteTask=1:"
172-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
175+
f"CoreWorkerService.grpc_client.CancelRemoteTask=1:{failure}",
173176
)
174177

175178

176179
@pytest.mark.parametrize(
177-
"inject_cancel_remote_task_rpc_failure", ["request", "response"], indirect=True
180+
"inject_cancel_remote_task_rpc_failure",
181+
RPC_FAILURE_TYPES,
182+
indirect=True,
178183
)
179184
def test_cancel_remote_task_rpc_retry_and_idempotency(
180185
inject_cancel_remote_task_rpc_failure, ray_start_cluster
@@ -208,7 +213,7 @@ def remote_wait(sg):
208213
def test_double_borrowing_with_rpc_failure(monkeypatch, shutdown_only):
209214
"""Regression test for https://github.com/ray-project/ray/issues/57997"""
210215
monkeypatch.setenv(
211-
"RAY_testing_rpc_failure", "CoreWorkerService.grpc_client.PushTask=3:0:100"
216+
"RAY_testing_rpc_failure", "CoreWorkerService.grpc_client.PushTask=3:0:100:0"
212217
)
213218

214219
ray.init()

python/ray/tests/test_gcs_fault_tolerance.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,11 +1258,10 @@ def test_mark_job_finished_rpc_retry_and_idempotency(shutdown_only, monkeypatch)
12581258
Uses RPC failure injection to simulate network retry scenarios.
12591259
"""
12601260
# Inject RPC failures for MarkJobFinished - simulate network failures
1261-
# Format: method_name=max_failures:request_failure_prob:response_failure_prob
12621261
# We inject request failures to force retries and test idempotency
12631262
monkeypatch.setenv(
12641263
"RAY_testing_rpc_failure",
1265-
"ray::rpc::JobInfoGcsService.grpc_client.MarkJobFinished=3:50:0",
1264+
"ray::rpc::JobInfoGcsService.grpc_client.MarkJobFinished=3:50:0:0",
12661265
)
12671266

12681267
ray.init(num_cpus=1)

python/ray/tests/test_gcs_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ def test_kv_timeout(ray_start_regular):
109109
def test_kv_transient_network_error(shutdown_only, monkeypatch):
110110
monkeypatch.setenv(
111111
"RAY_testing_rpc_failure",
112-
"ray::rpc::InternalKVGcsService.grpc_client.InternalKVGet=5:25:25,"
113-
"ray::rpc::InternalKVGcsService.grpc_client.InternalKVPut=5:25:25",
112+
"ray::rpc::InternalKVGcsService.grpc_client.InternalKVGet=5:25:25:25,"
113+
"ray::rpc::InternalKVGcsService.grpc_client.InternalKVPut=5:25:25:25",
114114
)
115115
ray.init()
116116
gcs_address = ray._private.worker.global_worker.gcs_client.address

python/ray/tests/test_object_manager_fault_tolerance.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,22 @@
55

66
import ray
77
from ray._private.internal_api import get_memory_info_reply, get_state_from_address
8-
from ray._private.test_utils import wait_for_condition
8+
from ray._private.test_utils import (
9+
RPC_FAILURE_MAP,
10+
RPC_FAILURE_TYPES,
11+
wait_for_condition,
12+
)
913
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
1014

1115

12-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
16+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
1317
def test_free_objects_idempotent(
1418
monkeypatch, shutdown_only, deterministic_failure, ray_start_cluster
1519
):
20+
failure = RPC_FAILURE_MAP[deterministic_failure]
1621
monkeypatch.setenv(
1722
"RAY_testing_rpc_failure",
18-
"ObjectManagerService.grpc_client.FreeObjects=1:"
19-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
23+
f"ObjectManagerService.grpc_client.FreeObjects=1:{failure}",
2024
)
2125

2226
@ray.remote

python/ray/tests/test_raylet_fault_tolerance.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
import pytest
55

66
import ray
7-
from ray._private.test_utils import wait_for_condition
7+
from ray._private.test_utils import (
8+
RPC_FAILURE_MAP,
9+
RPC_FAILURE_TYPES,
10+
wait_for_condition,
11+
)
812
from ray.core.generated import autoscaler_pb2
913
from ray.util.placement_group import placement_group, remove_placement_group
1014
from ray.util.scheduling_strategies import (
@@ -15,14 +19,14 @@
1519
import psutil
1620

1721

18-
@pytest.mark.parametrize("deterministic_failure", ["request", "response"])
22+
@pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES)
1923
def test_request_worker_lease_idempotent(
2024
monkeypatch, shutdown_only, deterministic_failure, ray_start_cluster
2125
):
26+
failure = RPC_FAILURE_MAP[deterministic_failure]
2227
monkeypatch.setenv(
2328
"RAY_testing_rpc_failure",
24-
"NodeManagerService.grpc_client.RequestWorkerLease=1:"
25-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
29+
f"NodeManagerService.grpc_client.RequestWorkerLease=1:{failure}",
2630
)
2731

2832
@ray.remote
@@ -57,7 +61,7 @@ def test_drain_node_idempotent(monkeypatch, shutdown_only, ray_start_cluster):
5761
# NOTE: not testing response failure since the node is already marked as draining and shuts down gracefully.
5862
monkeypatch.setenv(
5963
"RAY_testing_rpc_failure",
60-
"NodeManagerService.grpc_client.DrainRaylet=1:100:0",
64+
"NodeManagerService.grpc_client.DrainRaylet=1:100:0:0",
6165
)
6266

6367
cluster = ray_start_cluster
@@ -94,16 +98,18 @@ def node_is_dead():
9498
@pytest.fixture
9599
def inject_release_unused_bundles_rpc_failure(monkeypatch, request):
96100
deterministic_failure = request.param
101+
failure = RPC_FAILURE_MAP[deterministic_failure]
97102
monkeypatch.setenv(
98103
"RAY_testing_rpc_failure",
99-
"NodeManagerService.grpc_client.ReleaseUnusedBundles=1:"
100-
+ ("100:0" if deterministic_failure == "request" else "0:100")
101-
+ ",NodeManagerService.grpc_client.CancelResourceReserve=-1:100:0",
104+
f"NodeManagerService.grpc_client.ReleaseUnusedBundles=1:{failure}"
105+
+ ",NodeManagerService.grpc_client.CancelResourceReserve=-1:100:0:0",
102106
)
103107

104108

105109
@pytest.mark.parametrize(
106-
"inject_release_unused_bundles_rpc_failure", ["request", "response"], indirect=True
110+
"inject_release_unused_bundles_rpc_failure",
111+
RPC_FAILURE_TYPES,
112+
indirect=True,
107113
)
108114
@pytest.mark.parametrize(
109115
"ray_start_cluster_head_with_external_redis",
@@ -146,15 +152,17 @@ def task():
146152
@pytest.fixture
147153
def inject_notify_gcs_restart_rpc_failure(monkeypatch, request):
148154
deterministic_failure = request.param
155+
failure = RPC_FAILURE_MAP[deterministic_failure]
149156
monkeypatch.setenv(
150157
"RAY_testing_rpc_failure",
151-
"NodeManagerService.grpc_client.NotifyGCSRestart=1:"
152-
+ ("100:0" if deterministic_failure == "request" else "0:100"),
158+
f"NodeManagerService.grpc_client.NotifyGCSRestart=1:{failure}",
153159
)
154160

155161

156162
@pytest.mark.parametrize(
157-
"inject_notify_gcs_restart_rpc_failure", ["request", "response"], indirect=True
163+
"inject_notify_gcs_restart_rpc_failure",
164+
RPC_FAILURE_TYPES,
165+
indirect=True,
158166
)
159167
@pytest.mark.parametrize(
160168
"ray_start_cluster_head_with_external_redis",
@@ -207,7 +215,7 @@ def test_kill_local_actor_rpc_retry_and_idempotency(monkeypatch, shutdown_only):
207215

208216
monkeypatch.setenv(
209217
"RAY_testing_rpc_failure",
210-
"NodeManagerService.grpc_client.KillLocalActor=1:100:0",
218+
"NodeManagerService.grpc_client.KillLocalActor=1:100:0:0",
211219
)
212220

213221
ray.init()

python/ray/tests/test_streaming_generator_4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def test_many_tasks_lineage_reconstruction_mini_stress_test(
190190
)
191191
m.setenv(
192192
"RAY_testing_rpc_failure",
193-
"CoreWorkerService.grpc_client.ReportGeneratorItemReturns=5:25:25",
193+
"CoreWorkerService.grpc_client.ReportGeneratorItemReturns=5:25:25:25",
194194
)
195195
cluster = ray_start_cluster
196196
cluster.add_node(

src/ray/common/ray_config_def.h

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -851,32 +851,37 @@ RAY_CONFIG(std::string, REDIS_CLIENT_KEY, "")
851851
RAY_CONFIG(std::string, REDIS_SERVER_NAME, "")
852852

853853
/// grpc delay testing flags
854-
/// To use this, simply do
854+
/// To use this,
855855
/// export RAY_testing_asio_delay_us="method1=min_val:max_val,method2=20:100"
856856
// The delay is a random number between the interval. If method equals '*',
857857
// it will apply to all methods.
858858
RAY_CONFIG(std::string, testing_asio_delay_us, "")
859859

860-
/// To use this, simply do
861-
/// export
862-
/// RAY_testing_rpc_failure="method1=max_num_failures:req_failure_prob:resp_failure_prob,method2=max_num_failures:req_failure_prob:resp_failure_prob"
860+
/// To use this,
861+
/// export
862+
/// RAY_testing_rpc_failure="method1=max_num_failures:req_failure_prob:resp_failure_prob:in_flight_failure_prob,method2=max_num_failures:req_failure_prob:resp_failure_prob:in_flight_failure_prob"
863863
/// If you want to test all rpc failures you can use * as the method name and you can set
864864
/// -1 max_num_failures to have unlimited failures.
865-
/// Ex. unlimited failures for all rpc's with 25% request failures and 50% response
866-
/// failures.
867-
/// export RAY_testing_rpc_failure="*=-1:25:50"
865+
/// Ex. unlimited failures for all rpc's with 25% request failures, 50% response
866+
/// failures, and 10% in-flight failures.
867+
/// export RAY_testing_rpc_failure="*=-1:25:50:10"
868868
/// NOTE: Setting the wildcard will override any configuration for other methods.
869869
///
870-
/// You can also provide an optional fourth and/or fifth parameter to specify that there
871-
/// should be at least a certain amount of failures on the request and response. The
872-
/// fourth parameter is for the request and the fifth parameter is for the response. By
873-
/// default these are set to 0, but by setting them to positive values it guarantees
874-
/// that the first X request RPCs will fail, followed by Y response RPCs that will fail.
870+
/// You can also provide an optional fifth, sixth, and/or seventh parameter to specify
871+
/// that there should be at least a certain number of failures.
872+
/// The 5th parameter is for request failures.
873+
/// The 6th parameter is for response failures.
874+
/// The 7th parameter is for in-flight failures.
875+
/// By default these are set to 0, but by setting them to positive values it guarantees
876+
/// that the first X request RPCs will fail, followed by Y response RPCs that will fail,
877+
/// followed by Z in-flight RPCs that will fail.
875878
/// Afterwards, it will revert to the probabilistic failures. You can combine this with
876879
/// the wildcard so that each RPC method will have the same lower bounds applied.
877-
/// Ex. unlimited failures for all rpc's with 25% request failures and 50% response
878-
/// failures with at least 2 request failures and 3 response failures.
879-
/// export RAY_testing_rpc_failure="*=-1:25:50:2:3"
880+
///
881+
/// Ex. unlimited failures for all RPCs with 25% request failures, 50% response failures,
882+
/// and 10% in-flight failures with at least 2 request failures, 3 response failures, and
883+
/// 1 in-flight failure.
884+
/// export RAY_testing_rpc_failure="*=-1:25:50:10:2:3:1"
880885
RAY_CONFIG(std::string, testing_rpc_failure, "")
881886
/// If this is set, when injecting RPC failures, we'll check if the server and client have
882887
/// the same address. If they do, we won't inject the failure.

0 commit comments

Comments
 (0)