2525import ray ._private .prometheus_exporter as prometheus_exporter
2626import ray .dashboard .modules .reporter .reporter_consts as reporter_consts
2727import ray .dashboard .utils as dashboard_utils
28- from ray ._common .network_utils import build_address , parse_address
28+ from ray ._common .network_utils import parse_address
2929from ray ._common .utils import (
3030 get_or_create_event_loop ,
3131 get_user_temp_dir ,
3434from ray ._private .metrics_agent import Gauge , MetricsAgent , Record
3535from ray ._private .ray_constants import (
3636 DEBUG_AUTOSCALING_STATUS ,
37- GLOBAL_GRPC_OPTIONS ,
3837 RAY_ENABLE_OPEN_TELEMETRY ,
3938 env_integer ,
4039)
4140from ray ._private .telemetry .open_telemetry_metric_recorder import (
4241 OpenTelemetryMetricRecorder ,
4342)
4443from ray ._private .utils import get_system_memory
45- from ray ._raylet import GCS_PID_KEY , WorkerID
44+ from ray ._raylet import GCS_PID_KEY , RayletClient , WorkerID
4645from ray .core .generated import (
47- node_manager_pb2 ,
48- node_manager_pb2_grpc ,
4946 reporter_pb2 ,
5047 reporter_pb2_grpc ,
5148)
5653 COMPONENT_METRICS_TAG_KEYS ,
5754 GCS_RPC_TIMEOUT_SECONDS ,
5855 GPU_TAG_KEYS ,
59- NODE_MANAGER_RPC_TIMEOUT_SECONDS ,
6056 NODE_TAG_KEYS ,
57+ RAYLET_RPC_TIMEOUT_SECONDS ,
6158 TPU_TAG_KEYS ,
6259)
6360from ray .dashboard .modules .reporter .gpu_profile_manager import GpuProfilingManager
@@ -490,10 +487,6 @@ def __init__(self, dashboard_agent):
490487 # Create GPU metric provider instance
491488 self ._gpu_metric_provider = GpuMetricProvider ()
492489
493- self ._node_manager_address = build_address (
494- self ._ip , self ._dashboard_agent .node_manager_port
495- )
496-
497490 async def GetTraceback (self , request , context ):
498491 pid = request .pid
499492 native = request .native
@@ -896,17 +889,14 @@ def _get_disk_io_stats():
896889 stats .write_count ,
897890 )
898891
899- async def _get_worker_pids_from_raylet (self ):
900- channel = ray ._private .utils .init_grpc_channel (
901- self ._node_manager_address , GLOBAL_GRPC_OPTIONS , asynchronous = True
892+ def _get_worker_pids_from_raylet (self ) -> Optional [List [int ]]:
893+ # Get worker pids from raylet via gRPC.
894+ timeout = RAYLET_RPC_TIMEOUT_SECONDS * 1000 # in milliseconds
895+ raylet_client = RayletClient (
896+ ip_address = self ._ip , port = self ._dashboard_agent .node_manager_port
902897 )
903- timeout = NODE_MANAGER_RPC_TIMEOUT_SECONDS
904- stub = node_manager_pb2_grpc .NodeManagerServiceStub (channel )
905898 try :
906- reply = await stub .GetDriverAndWorkerPids (
907- node_manager_pb2 .GetDriverAndWorkerPidsRequest (), timeout = timeout
908- )
909- return reply .pids
899+ return raylet_client .get_worker_pids (timeout = timeout )
910900 except Exception as e :
911901 logger .debug (f"Failed to get worker pids from raylet via gRPC: { e } " )
912902 return None
@@ -920,7 +910,7 @@ def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]:
920910 return (proc .pid , proc .create_time ())
921911
922912 def _get_worker_processes (self ):
923- pids = asyncio . run ( self ._get_worker_pids_from_raylet () )
913+ pids = self ._get_worker_pids_from_raylet ()
924914 if pids is not None :
925915 workers = {}
926916 for pid in pids :
@@ -931,25 +921,6 @@ def _get_worker_processes(self):
931921 continue
932922 return workers
933923
934- logger .debug ("fallback to get worker processes from raylet children" )
935- raylet_proc = self ._get_raylet_proc ()
936- if raylet_proc is None :
937- return []
938- else :
939- workers = {}
940- if sys .platform == "win32" :
941- # windows, get the child process not the runner
942- for child in raylet_proc .children ():
943- if child .children ():
944- child = child .children ()[0 ]
945- workers [self ._generate_worker_key (child )] = child
946- else :
947- workers = {
948- self ._generate_worker_key (proc ): proc
949- for proc in raylet_proc .children ()
950- }
951- return workers
952-
953924 def _get_workers (self , gpus : Optional [List [GpuUtilizationInfo ]] = None ):
954925 workers = self ._get_worker_processes ()
955926 if not workers :
0 commit comments