Skip to content

Commit 7970875

Browse files
[Utils] Make worker connection timeout parameters configurable (#58372)
When starting a Ray cluster in a KubeRay environment, the startup process can sometimes be slow. In such cases, the timeout must be increased for startup to succeed; otherwise, the error "ray client connection timeout" occurs. Therefore, the timeout and retry policies for the Ray worker need to be configurable. --------- Signed-off-by: OneSizeFitsQuorum <txypotato@gmail.com>
1 parent f6bb8b8 commit 7970875

File tree

1 file changed

+12
-12
lines changed

1 file changed

+12
-12
lines changed

python/ray/util/client/worker.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@
2121
import ray.cloudpickle as cloudpickle
2222
import ray.core.generated.ray_client_pb2 as ray_client_pb2
2323
import ray.core.generated.ray_client_pb2_grpc as ray_client_pb2_grpc
24-
from ray._private.ray_constants import DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD
24+
from ray._private.ray_constants import (
25+
DEFAULT_CLIENT_RECONNECT_GRACE_PERIOD,
26+
env_float,
27+
env_integer,
28+
)
2529
from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
2630
from ray._private.runtime_env.working_dir import upload_working_dir_if_needed
2731

@@ -52,13 +56,14 @@
5256

5357
logger = logging.getLogger(__name__)
5458

55-
INITIAL_TIMEOUT_SEC = 5
56-
MAX_TIMEOUT_SEC = 30
57-
59+
INITIAL_TIMEOUT_SEC = env_integer("RAY_CLIENT_INITIAL_CONNECTION_TIMEOUT_S", 5)
60+
MAX_TIMEOUT_SEC = env_integer("RAY_CLIENT_MAX_CONNECTION_TIMEOUT_S", 30)
5861
# The max amount of time an operation can run blocking in the server. This
5962
# allows for Ctrl-C of the client to work without explicitly cancelling server
6063
# operations.
61-
MAX_BLOCKING_OPERATION_TIME_S: float = 2.0
64+
MAX_BLOCKING_OPERATION_TIME_S: float = env_float(
65+
"RAY_CLIENT_MAX_BLOCKING_OPERATION_TIME_S", 2.0
66+
)
6267

6368
# If the total size (bytes) of all outbound messages to schedule tasks since
6469
# the connection began exceeds this value, a warning should be raised
@@ -416,19 +421,14 @@ def get(self, vals, *, timeout: Optional[float] = None) -> Any:
416421
else:
417422
deadline = time.monotonic() + timeout
418423

419-
max_blocking_operation_time = MAX_BLOCKING_OPERATION_TIME_S
420-
if "RAY_CLIENT_MAX_BLOCKING_OPERATION_TIME_S" in os.environ:
421-
max_blocking_operation_time = float(
422-
os.environ["RAY_CLIENT_MAX_BLOCKING_OPERATION_TIME_S"]
423-
)
424424
while True:
425425
if deadline:
426426
op_timeout = min(
427-
max_blocking_operation_time,
427+
MAX_BLOCKING_OPERATION_TIME_S,
428428
max(deadline - time.monotonic(), 0.001),
429429
)
430430
else:
431-
op_timeout = max_blocking_operation_time
431+
op_timeout = MAX_BLOCKING_OPERATION_TIME_S
432432
try:
433433
res = self._get(to_get, op_timeout)
434434
break

0 commit comments

Comments
 (0)