Skip to content

Commit 35451f9

Browse files
VarunBhandaryvb-dbrksdayshah
authored
[core] Fix get actor timeout multiplier (#54525)
Signed-off-by: Varun Bhandary <varun.bhandary@databricks.com> Signed-off-by: DataErrata <varun.bhandary@gmail.com> Co-authored-by: Varun Bhandary <varun.bhandary@databricks.com> Co-authored-by: Dhyey Shah <dhyey2019@gmail.com>
1 parent e5ed7bf commit 35451f9

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

python/ray/tests/test_state_api.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
WorkerState,
112112
StateSchema,
113113
state_column,
114+
GetApiOptions,
114115
)
115116
from ray.dashboard.utils import ray_address_to_api_server_url
116117
from ray.util.state.exception import DataSourceUnavailable, RayStateApiException
@@ -3800,5 +3801,40 @@ def test_hang_driver_has_no_is_running_task(monkeypatch, ray_start_cluster):
38003801
assert not all_job_info[my_job_id].HasField("is_running_tasks")
38013802

38023803

3804+
def test_get_actor_timeout_multiplier(shutdown_only):
3805+
"""Test that GetApiOptions applies the same timeout multiplier as ListApiOptions.
3806+
3807+
This test reproduces the issue where get_actor with timeout=1 fails even though
3808+
the actual operation takes less than 1 second, because GetApiOptions doesn't
3809+
apply the 0.8 server timeout multiplier that ListApiOptions uses.
3810+
3811+
Related issue: https://github.com/ray-project/ray/issues/54153
3812+
"""
3813+
3814+
@ray.remote
3815+
class TestActor:
3816+
def ready(self):
3817+
pass
3818+
3819+
actor = TestActor.remote()
3820+
ray.get(actor.ready.remote())
3821+
3822+
# Test that both options classes apply the same timeout multiplier
3823+
test_timeout = 1
3824+
get_options = GetApiOptions(timeout=test_timeout)
3825+
list_options = ListApiOptions(timeout=test_timeout)
3826+
3827+
# After __post_init__, both should have the same effective timeout
3828+
assert get_options.timeout == list_options.timeout
3829+
3830+
# Test that get_actor works with a 1-second timeout
3831+
actors = list_actors()
3832+
actor_id = actors[0]["actor_id"]
3833+
3834+
# This should work without timeout issues
3835+
result = get_actor(actor_id, timeout=1)
3836+
assert result["actor_id"] == actor_id
3837+
3838+
38033839
if __name__ == "__main__":
38043840
sys.exit(pytest.main(["-sv", __file__]))

python/ray/util/state/common.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def __post_init__(self):
152152
# To return the data to users, when there's a partial failure
153153
# we need to have a timeout that's smaller than the users' timeout.
154154
# 80% is configured arbitrarily.
155-
self.timeout = int(self.timeout * self.server_timeout_multiplier)
155+
self.timeout = max(1, int(self.timeout * self.server_timeout_multiplier))
156156
assert self.timeout != 0, "0 second timeout is not supported."
157157
if self.filters is None:
158158
self.filters = []
@@ -197,6 +197,18 @@ def has_conflicting_filters(self) -> bool:
197197
class GetApiOptions:
198198
# Timeout for the HTTP request
199199
timeout: int = DEFAULT_RPC_TIMEOUT
200+
# When the request is processed on the server side,
201+
# we should apply multiplier so that server side can finish
202+
# processing a request within timeout. Otherwise,
203+
# timeout will always lead Http timeout.
204+
server_timeout_multiplier: float = 0.8
205+
206+
def __post_init__(self):
207+
# To return the data to users, when there's a partial failure
208+
# we need to have a timeout that's smaller than the users' timeout.
209+
# 80% is configured arbitrarily.
210+
self.timeout = max(1, int(self.timeout * self.server_timeout_multiplier))
211+
assert self.timeout != 0, "0 second timeout is not supported."
200212

201213

202214
@dataclass(init=not IS_PYDANTIC_2)

0 commit comments

Comments
 (0)