[core][dashboard] configurable timeouts (#47181)
The Ray Dashboard reorganizes the stats of all nodes and all workers every 2 s and purges them every 600 s. Make both intervals configurable through environment variables. Also remove two constants that were never used.

Signed-off-by: Ruiyang Wang <rywang014@gmail.com>
rynewang authored Aug 16, 2024
1 parent ee3625d commit 9c0e1c4
Showing 6 changed files with 26 additions and 20 deletions.
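
Both new intervals are read from the environment of the process that starts the dashboard (the Ray head node). A minimal sketch of overriding them before starting Ray, assuming the variables are inherited by the dashboard subprocess; the values shown are illustrative, not recommendations:

import os

import ray

# Purge stale node/worker stats every 5 minutes instead of the 10-minute default.
os.environ["RAY_DASHBOARD_STATS_PURGING_INTERVAL"] = "300"
# Re-organize node/worker stats every 5 seconds instead of every 2 seconds.
os.environ["RAY_DASHBOARD_STATS_UPDATING_INTERVAL"] = "5"

ray.init(include_dashboard=True)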
12 changes: 7 additions & 5 deletions python/ray/dashboard/consts.py
@@ -22,10 +22,12 @@
# Max allowed number of in-progress requests could be configured.
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000

-RETRY_REDIS_CONNECTION_TIMES = 10
-CONNECT_REDIS_INTERNAL_SECONDS = 2
-PURGE_DATA_INTERVAL_SECONDS = 60 * 10
-ORGANIZE_DATA_INTERVAL_SECONDS = 2
+RAY_DASHBOARD_STATS_PURGING_INTERVAL = env_integer(
+"RAY_DASHBOARD_STATS_PURGING_INTERVAL", 60 * 10
+)
+RAY_DASHBOARD_STATS_UPDATING_INTERVAL = env_integer(
+"RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 2
+)
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
DASHBOARD_RPC_PORT = env_integer("RAY_DASHBOARD_RPC_PORT", 0)
GCS_SERVER_ADDRESS = "GcsServerAddress"
@@ -38,7 +40,7 @@
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
"GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2
)
-GCS_RPC_TIMEOUT_SECONDS = 3
+GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
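
The constants above are resolved through env_integer (and, in other files touched by this commit, env_float) from ray._private.ray_constants. As a standalone illustration of the fallback behavior these helpers provide, not Ray's actual implementation:

import os


def env_integer_sketch(key: str, default: int) -> int:
    # Return the integer value of the environment variable, or the default.
    value = os.environ.get(key)
    return int(value) if value is not None else default


def env_float_sketch(key: str, default: float) -> float:
    # Float variant, used for sub-second intervals such as 0.1 s.
    value = os.environ.get(key)
    return float(value) if value is not None else default


# With RAY_DASHBOARD_STATS_UPDATING_INTERVAL unset, the 2-second default applies.
stats_updating_interval = env_integer_sketch("RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 2)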
4 changes: 2 additions & 2 deletions python/ray/dashboard/datacenter.py
@@ -45,7 +45,7 @@ class DataOrganizer:
head_node_ip = None

@staticmethod
-@async_loop_forever(dashboard_consts.PURGE_DATA_INTERVAL_SECONDS)
+@async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_PURGING_INTERVAL)
async def purge():
# Purge data that is out of date.
# These data sources are maintained by DashboardHead,
@@ -66,7 +66,7 @@ async def purge():
DataSource.node_physical_stats.pop(key)

@classmethod
-@async_loop_forever(dashboard_consts.ORGANIZE_DATA_INTERVAL_SECONDS)
+@async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_UPDATING_INTERVAL)
async def organize(cls):
node_workers = {}
core_worker_stats = {}
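
Both purge() and organize() are driven by the async_loop_forever decorator, which is what turns the two constants into periodic intervals. A minimal sketch of what such a decorator plausibly looks like, for illustration only; the name async_loop_forever_sketch and its error handling are assumptions, not Ray's implementation:

import asyncio
import functools
import logging

logger = logging.getLogger(__name__)


def async_loop_forever_sketch(interval_seconds: float):
    # Wrap a coroutine so that it runs forever, sleeping between iterations.
    def decorator(coro):
        @functools.wraps(coro)
        async def looped(*args, **kwargs):
            while True:
                try:
                    await coro(*args, **kwargs)
                except Exception:
                    logger.exception("Periodic task %s failed", coro.__name__)
                await asyncio.sleep(interval_seconds)

        return looped

    return decorator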
6 changes: 4 additions & 2 deletions python/ray/dashboard/modules/event/event_consts.py
@@ -1,4 +1,4 @@
-from ray._private.ray_constants import env_integer
+from ray._private.ray_constants import env_float, env_integer
from ray.core.generated import event_pb2

LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
@@ -12,7 +12,9 @@
"EVENT_READ_LINE_LENGTH_LIMIT", 2 * 1024 * 1024
) # 2MB
# Report events
-EVENT_AGENT_REPORT_INTERVAL_SECONDS = 0.1
+EVENT_AGENT_REPORT_INTERVAL_SECONDS = env_float(
+"EVENT_AGENT_REPORT_INTERVAL_SECONDS", 0.1
+)
EVENT_AGENT_RETRY_TIMES = 10
EVENT_AGENT_CACHE_SIZE = 10240
# Event sources
3 changes: 2 additions & 1 deletion python/ray/dashboard/modules/metrics/metrics_head.py
@@ -11,6 +11,7 @@
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray._private.async_utils import enable_monitor_loop_lag
+from ray._private.ray_constants import env_integer
from ray.dashboard.consts import (
AVAILABLE_COMPONENT_NAMES_FOR_METRICS,
METRICS_INPUT_ROOT,
@@ -39,7 +40,7 @@
routes = dashboard_optional_utils.DashboardHeadRouteTable

METRICS_OUTPUT_ROOT_ENV_VAR = "RAY_METRICS_OUTPUT_ROOT"
-METRICS_RECORD_INTERVAL_S = 5
+METRICS_RECORD_INTERVAL_S = env_integer("METRICS_RECORD_INTERVAL_S", 5)

DEFAULT_PROMETHEUS_HOST = "http://localhost:9090"
PROMETHEUS_HOST_ENV_VAR = "RAY_PROMETHEUS_HOST"
17 changes: 9 additions & 8 deletions python/ray/dashboard/modules/node/node_consts.py
@@ -1,15 +1,16 @@
-from ray._private.ray_constants import env_integer
+from ray._private.ray_constants import env_float, env_integer

-NODE_STATS_UPDATE_INTERVAL_SECONDS = 5
-UPDATE_NODES_INTERVAL_SECONDS = 5
+NODE_STATS_UPDATE_INTERVAL_SECONDS = env_integer(
+"NODE_STATS_UPDATE_INTERVAL_SECONDS", 5
+)
+UPDATE_NODES_INTERVAL_SECONDS = env_integer("UPDATE_NODES_INTERVAL_SECONDS", 5)
# Until the head node is registered,
# the API server is doing more frequent update
# with this interval.
-FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS = 0.1
+FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS = env_float(
+"FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS", 0.1
+)
# If the head node is not updated within
# this timeout, it will stop frequent update.
-FREQUENT_UPDATE_TIMEOUT_SECONDS = 10
+FREQUENT_UPDATE_TIMEOUT_SECONDS = env_integer("FREQUENT_UPDATE_TIMEOUT_SECONDS", 10)
MAX_COUNT_OF_GCS_RPC_ERROR = 10
-
-# Timeout for RPCs to GCS.
-GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)
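
The comments above describe a two-speed polling scheme: nodes are polled at FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS until the head node is registered or FREQUENT_UPDATE_TIMEOUT_SECONDS elapses, then at the slower UPDATE_NODES_INTERVAL_SECONDS. A hedged sketch of that control flow with the default values inlined; the function and argument names are illustrative, not the actual node_head implementation:

import asyncio
import time

# Defaults mirror node_consts.py; the real module now reads them via env_integer/env_float.
UPDATE_NODES_INTERVAL_SECONDS = 5
FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS = 0.1
FREQUENT_UPDATE_TIMEOUT_SECONDS = 10


async def update_nodes_loop(head_node_registered, update_nodes):
    # Poll quickly until the head node shows up (or the timeout passes),
    # then settle into the regular interval. head_node_registered is a plain
    # callable and update_nodes an async callable, both supplied by the caller.
    start = time.monotonic()
    while True:
        await update_nodes()
        frequent = (
            not head_node_registered()
            and time.monotonic() - start < FREQUENT_UPDATE_TIMEOUT_SECONDS
        )
        await asyncio.sleep(
            FREQUENTY_UPDATE_NODES_INTERVAL_SECONDS
            if frequent
            else UPDATE_NODES_INTERVAL_SECONDS
        )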
4 changes: 2 additions & 2 deletions python/ray/dashboard/modules/node/node_head.py
@@ -219,7 +219,7 @@ async def _update_nodes(self):
node_id.encode(),
overwrite=True,
namespace=ray_constants.KV_NAMESPACE_JOB,
-timeout=node_consts.GCS_RPC_TIMEOUT_SECONDS,
+timeout=GCS_RPC_TIMEOUT_SECONDS,
)
node_id_to_ip[node_id] = ip
node_id_to_hostname[node_id] = hostname
@@ -240,7 +240,7 @@
agent_port = await self._gcs_aio_client.internal_kv_get(
key.encode(),
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
-timeout=node_consts.GCS_RPC_TIMEOUT_SECONDS,
+timeout=GCS_RPC_TIMEOUT_SECONDS,
)
if agent_port:
agents[node_id] = json.loads(agent_port)
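
After this change both internal-KV calls use the shared GCS_RPC_TIMEOUT_SECONDS from ray.dashboard.consts, which defaults to 60 s and can be overridden through RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS, instead of the node module's own constant. A generic sketch of guarding an awaitable with that timeout; the real client passes the value through to the underlying GCS RPC rather than wrapping it in asyncio.wait_for:

import asyncio
import os

# Mirrors the new default in python/ray/dashboard/consts.py.
GCS_RPC_TIMEOUT_SECONDS = int(os.environ.get("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60))


async def with_gcs_timeout(awaitable):
    # Fail the call if the GCS does not answer within the configured timeout.
    return await asyncio.wait_for(awaitable, timeout=GCS_RPC_TIMEOUT_SECONDS)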
