     OpenTelemetryMetricRecorder,
 )
 from ray._private.utils import get_system_memory
-from ray._raylet import GCS_PID_KEY, WorkerID
+from ray._raylet import GCS_PID_KEY, RayletClient, WorkerID
 from ray.core.generated import reporter_pb2, reporter_pb2_grpc
 from ray.dashboard import k8s_utils
 from ray.dashboard.consts import (
@@ -66,6 +66,10 @@
 from ray.dashboard.modules.reporter.reporter_models import (
     StatsPayload,
 )
+from ray.exceptions import (
+    GetTimeoutError,
+    RpcError,
+)
 
 import psutil
 
@@ -395,9 +399,10 @@ class ReporterAgent(
 
     Attributes:
         dashboard_agent: The DashboardAgent object contains global config
+        raylet_client: The RayletClient object used to access the raylet server
     """
 
-    def __init__(self, dashboard_agent):
+    def __init__(self, dashboard_agent, raylet_client=None):
         """Initialize the reporter object."""
         super().__init__(dashboard_agent)
 
@@ -486,6 +491,13 @@ def __init__(self, dashboard_agent):
         # Create GPU metric provider instance
         self._gpu_metric_provider = GpuMetricProvider()
 
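+        # Prefer an injected RayletClient (e.g. for testing); otherwise create one
+        # that connects to the local raylet via this node's IP and node manager port.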
+        if raylet_client:
+            self._raylet_client = raylet_client
+        else:
+            self._raylet_client = RayletClient(
+                ip_address=self._ip, port=self._dashboard_agent.node_manager_port
+            )
+
     async def GetTraceback(self, request, context):
         pid = request.pid
         native = request.native
@@ -888,6 +900,14 @@ def _get_disk_io_stats():
             stats.write_count,
         )
 
+    async def _async_get_worker_pids_from_raylet(self) -> List[int]:
+        try:
+            # Get worker pids from raylet via gRPC.
+            return await self._raylet_client.async_get_worker_pids()
+        except (GetTimeoutError, RpcError):
+            logger.exception("Failed to get worker pids from raylet")
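+            # Fall back to an empty list so this collection cycle simply reports no workers.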
+            return []
+
     def _get_agent_proc(self) -> psutil.Process:
         # Agent is the current process.
         # This method is not necessary, but we have it for mock testing.
@@ -896,27 +916,23 @@ def _get_agent_proc(self) -> psutil.Process:
     def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]:
         return (proc.pid, proc.create_time())
 
-    def _get_worker_processes(self):
-        raylet_proc = self._get_raylet_proc()
-        if raylet_proc is None:
+    async def _async_get_worker_processes(self):
+        pids = await self._async_get_worker_pids_from_raylet()
+        logger.debug(f"Worker PIDs from raylet: {pids}")
+        if not pids:
             return []
-        else:
-            workers = {}
-            if sys.platform == "win32":
-                # windows, get the child process not the runner
-                for child in raylet_proc.children():
-                    if child.children():
-                        child = child.children()[0]
-                    workers[self._generate_worker_key(child)] = child
-            else:
-                workers = {
-                    self._generate_worker_key(proc): proc
-                    for proc in raylet_proc.children()
-                }
-            return workers
-
-    def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
-        workers = self._get_worker_processes()
+        workers = {}
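+        # Resolve each PID reported by the raylet to a psutil.Process; PIDs whose
+        # process has already exited (or that we cannot access) are logged and skipped.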
+        for pid in pids:
+            try:
+                proc = psutil.Process(pid)
+                workers[self._generate_worker_key(proc)] = proc
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                logger.error(f"Failed to access worker process {pid}")
+                continue
+        return workers
+
+    async def _async_get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
+        workers = await self._async_get_worker_processes()
         if not workers:
             return []
         else:
@@ -936,9 +952,6 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
             for k in keys_to_pop:
                 self._workers.pop(k)
 
-            # Remove the current process (reporter agent), which is also a child of
-            # the Raylet.
-            self._workers.pop(self._generate_worker_key(self._get_agent_proc()))
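+            # The reporter agent should no longer show up here, since worker PIDs now
+            # come from the raylet RPC rather than the raylet's child processes.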
             # Build process ID -> GPU info mapping for faster lookups
             gpu_pid_mapping = defaultdict(list)
             if gpus is not None:
@@ -1058,7 +1071,7 @@ def _get_shm_usage(self):
             return None
         return mem.shared
 
-    def _collect_stats(self):
+    async def _async_collect_stats(self):
         now = dashboard_utils.to_posix_time(datetime.datetime.utcnow())
         network_stats = self._get_network_stats()
         self._network_stats_hist.append((now, network_stats))
@@ -1079,7 +1092,7 @@ def _collect_stats(self):
10791092 "mem" : self ._get_mem_usage (),
10801093 # Unit is in bytes. None if
10811094 "shm" : self ._get_shm_usage (),
1082- "workers" : self ._get_workers (gpus ),
1095+ "workers" : await self ._async_get_workers (gpus ),
10831096 "raylet" : raylet ,
10841097 "agent" : self ._get_agent (),
10851098 "bootTime" : self ._get_boot_time (),
@@ -1726,7 +1739,7 @@ async def _run_loop(self):
             # executor (TPE) to avoid blocking the Agent's event-loop
             json_payload = await loop.run_in_executor(
                 self._executor,
-                self._compose_stats_payload,
+                self._run_in_executor,
                 autoscaler_status_json_bytes,
             )
 
@@ -1739,10 +1752,15 @@ async def _run_loop(self):
 
             await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000)
 
-    def _compose_stats_payload(
+    def _run_in_executor(self, cluster_autoscaling_stats_json: Optional[bytes]) -> str:
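+        # This runs inside the thread-pool executor; asyncio.run() gives the worker
+        # thread its own event loop, so the async stats collection can await the
+        # raylet RPC without touching the agent's main event loop.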
+        return asyncio.run(
+            self._async_compose_stats_payload(cluster_autoscaling_stats_json)
+        )
+
+    async def _async_compose_stats_payload(
         self, cluster_autoscaling_stats_json: Optional[bytes]
     ) -> str:
-        stats = self._collect_stats()
+        stats = await self._async_collect_stats()
 
         # Report stats only when metrics collection is enabled.
         if not self._metrics_collection_disabled: