
Commit 0582425

tianyi-ge authored and elliot-barn committed
[core] allow reporter agent to get pid via rpc to raylet (#57004)
Problem:
1. Currently, the reporter agent is spawned by the raylet process, and it assumes that all core workers are direct children of the raylet. That is no longer the case with new features (uv, image_url), so the reporter agent needs another way to find all core workers. See https://github.com/ray-project/ray/blob/10eacfd6ddf3b84827d016e37294bc5f2577ad3f/python/ray/dashboard/modules/reporter/reporter_agent.py#L911
2. The driver is not spawned by the raylet, so it is never monitored.

Implementation:
1. Add a gRPC endpoint in the raylet process (node manager) and allow the reporter agent to connect to it.
2. The reporter agent fetches the worker list, including the driver, from the gRPC reply. It creates a raylet client with a dedicated thread.

Closes #56739

Signed-off-by: tianyi-ge <tianyig@outlook.com>
Signed-off-by: elliot-barn <elliot.barnwell@anyscale.com>
1 parent 13e0416 · commit 0582425

30 files changed: +437 -79 lines
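As a rough illustration of the agent-side flow described in the commit message, the sketch below shows a reporter-agent-style loop that asks the raylet for every worker PID (driver included) and resolves each PID to a psutil process keyed by (pid, create_time). The `async_get_worker_pids` name mirrors the diff further down; `FakeRayletClient` and the rest of the scaffolding are illustrative stand-ins, not Ray's real API surface.

```python
# Hypothetical sketch of the agent-side flow introduced by this PR.
# `async_get_worker_pids` mirrors the method name in the diff below;
# `FakeRayletClient` is a stand-in for ray._raylet.RayletClient.
import asyncio
from typing import Dict, List, Tuple

import psutil


class FakeRayletClient:
    """Stand-in for the raylet client; the real one issues a gRPC call."""

    async def async_get_worker_pids(self) -> List[int]:
        # The real reply from the raylet (node manager) lists every worker
        # PID, including the driver. Here we just grab a few local PIDs.
        return [p.pid for p in psutil.process_iter()][:3]


async def get_worker_processes(
    client: FakeRayletClient,
) -> Dict[Tuple[int, float], psutil.Process]:
    pids = await client.async_get_worker_pids()
    workers: Dict[Tuple[int, float], psutil.Process] = {}
    for pid in pids:
        try:
            proc = psutil.Process(pid)
            # Key on (pid, create_time) so a recycled PID is not mistaken
            # for the old worker process.
            workers[(proc.pid, proc.create_time())] = proc
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return workers


if __name__ == "__main__":
    print(asyncio.run(get_worker_processes(FakeRayletClient())))
```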

BUILD.bazel

Lines changed: 1 addition & 0 deletions
```diff
@@ -242,6 +242,7 @@ pyx_library(
         "//src/ray/gcs_rpc_client:global_state_accessor_lib",
         "//src/ray/protobuf:serialization_cc_proto",
         "//src/ray/pubsub:python_gcs_subscriber",
+        "//src/ray/raylet_rpc_client:raylet_client_with_io_context_lib",
         "//src/ray/thirdparty/setproctitle",
         "//src/ray/util:memory",
         "//src/ray/util:raii",
```

python/ray/_private/ray_constants.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -459,14 +459,6 @@ def env_set_by_user(key):
 # Default max_concurrency option in @ray.remote for threaded actors.
 DEFAULT_MAX_CONCURRENCY_THREADED = 1
 
-# Prefix for namespaces which are used internally by ray.
-# Jobs within these namespaces should be hidden from users
-# and should not be considered user activity.
-# Please keep this in sync with the definition kRayInternalNamespacePrefix
-# in /src/ray/gcs/gcs_server/gcs_job_manager.h.
-RAY_INTERNAL_NAMESPACE_PREFIX = "_ray_internal_"
-RAY_INTERNAL_DASHBOARD_NAMESPACE = f"{RAY_INTERNAL_NAMESPACE_PREFIX}dashboard"
-
 # Ray internal flags. These flags should not be set by users, and we strip them on job
 # submission.
 # This should be consistent with src/ray/common/ray_internal_flag_def.h
```

python/ray/_private/worker.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -2625,7 +2625,7 @@ def connect(
     # We also want to skip adding script directory when running from dashboard.
     code_paths = []
     if not interactive_mode and not (
-        namespace and namespace == ray_constants.RAY_INTERNAL_DASHBOARD_NAMESPACE
+        namespace and namespace == ray._raylet.RAY_INTERNAL_DASHBOARD_NAMESPACE
     ):
         script_directory = os.path.dirname(os.path.realpath(sys.argv[0]))
         # If driver's sys.path doesn't include the script directory
```

python/ray/_raylet.pyx

Lines changed: 1 addition & 0 deletions
```diff
@@ -199,6 +199,7 @@ include "includes/libcoreworker.pxi"
 include "includes/global_state_accessor.pxi"
 include "includes/metric.pxi"
 include "includes/setproctitle.pxi"
+include "includes/raylet_client.pxi"
 include "includes/gcs_subscriber.pxi"
 
 import ray
```

python/ray/dashboard/modules/job/common.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -14,7 +14,7 @@
     get_export_event_logger,
 )
 from ray._private.runtime_env.packaging import parse_uri
-from ray._raylet import GcsClient
+from ray._raylet import RAY_INTERNAL_NAMESPACE_PREFIX, GcsClient
 from ray.core.generated.export_event_pb2 import ExportEvent
 from ray.core.generated.export_submission_job_event_pb2 import (
     ExportSubmissionJobEventData,
@@ -25,9 +25,7 @@
 # they're exposed in the snapshot API.
 JOB_ID_METADATA_KEY = "job_submission_id"
 JOB_NAME_METADATA_KEY = "job_name"
-JOB_ACTOR_NAME_TEMPLATE = (
-    f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_actor_" + "{job_id}"
-)
+JOB_ACTOR_NAME_TEMPLATE = f"{RAY_INTERNAL_NAMESPACE_PREFIX}job_actor_" + "{job_id}"
 # In order to get information about SupervisorActors launched by different jobs,
 # they must be set to the same namespace.
 SUPERVISOR_ACTOR_RAY_NAMESPACE = "SUPERVISOR_ACTOR_RAY_NAMESPACE"
@@ -227,7 +225,7 @@ class JobInfoStorageClient:
 
     # Please keep this format in sync with JobDataKey()
     # in src/ray/gcs/gcs_server/gcs_job_manager.h.
-    JOB_DATA_KEY_PREFIX = f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_info_"
+    JOB_DATA_KEY_PREFIX = f"{RAY_INTERNAL_NAMESPACE_PREFIX}job_info_"
     JOB_DATA_KEY = f"{JOB_DATA_KEY_PREFIX}{{job_id}}"
 
     def __init__(
```

python/ray/dashboard/modules/job/utils.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 from typing import Any, AsyncIterator, Dict, List, Optional, Tuple, Union
 
 from ray._private import ray_constants
-from ray._raylet import GcsClient
+from ray._raylet import RAY_INTERNAL_NAMESPACE_PREFIX, GcsClient
 from ray.dashboard.modules.job.common import (
     JOB_ID_METADATA_KEY,
     JobInfoStorageClient,
@@ -178,7 +178,7 @@ async def get_driver_jobs(
     submission_job_drivers = {}
     for job_table_entry in sorted_job_infos:
         if job_table_entry.config.ray_namespace.startswith(
-            ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX
+            RAY_INTERNAL_NAMESPACE_PREFIX
         ):
             # Skip jobs in any _ray_internal_ namespace
             continue
```

python/ray/dashboard/modules/node/tests/test_node.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -283,5 +283,39 @@ def _check_workers():
     wait_for_condition(_check_workers, timeout=10)
 
 
+def test_worker_pids_reported(enable_test_module, ray_start_with_dashboard):
+    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
+    webui_url = ray_start_with_dashboard["webui_url"]
+    webui_url = format_web_url(webui_url)
+    node_id = ray_start_with_dashboard["node_id"]
+
+    @ray.remote(runtime_env={"uv": {"packages": ["requests==2.3.0"]}})
+    class UvActor:
+        def get_pid(self):
+            return os.getpid()
+
+    uv_actor = UvActor.remote()
+    uv_actor_pid = ray.get(uv_actor.get_pid.remote())
+    driver_pid = os.getpid()
+
+    def _check_worker_pids():
+        try:
+            response = requests.get(webui_url + f"/nodes/{node_id}")
+            response.raise_for_status()
+            dump_info = response.json()
+            assert dump_info["result"] is True
+            detail = dump_info["data"]["detail"]
+            pids = [worker["pid"] for worker in detail["workers"]]
+            assert len(pids) >= 2  # might include idle worker
+            assert uv_actor_pid in pids
+            assert driver_pid in pids
+            return True
+        except Exception as ex:
+            logger.info(ex)
+            return False
+
+    wait_for_condition(_check_worker_pids, timeout=20)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))
```

python/ray/dashboard/modules/reporter/reporter_agent.py

Lines changed: 48 additions & 30 deletions
```diff
@@ -41,7 +41,7 @@
     OpenTelemetryMetricRecorder,
 )
 from ray._private.utils import get_system_memory
-from ray._raylet import GCS_PID_KEY, WorkerID
+from ray._raylet import GCS_PID_KEY, RayletClient, WorkerID
 from ray.core.generated import reporter_pb2, reporter_pb2_grpc
 from ray.dashboard import k8s_utils
 from ray.dashboard.consts import (
@@ -66,6 +66,10 @@
 from ray.dashboard.modules.reporter.reporter_models import (
     StatsPayload,
 )
+from ray.exceptions import (
+    GetTimeoutError,
+    RpcError,
+)
 
 import psutil
 
@@ -395,9 +399,10 @@ class ReporterAgent(
 
     Attributes:
         dashboard_agent: The DashboardAgent object contains global config
+        raylet_client: The RayletClient object to access raylet server
     """
 
-    def __init__(self, dashboard_agent):
+    def __init__(self, dashboard_agent, raylet_client=None):
         """Initialize the reporter object."""
         super().__init__(dashboard_agent)
 
@@ -486,6 +491,13 @@ def __init__(self, dashboard_agent):
         # Create GPU metric provider instance
        self._gpu_metric_provider = GpuMetricProvider()
 
+        if raylet_client:
+            self._raylet_client = raylet_client
+        else:
+            self._raylet_client = RayletClient(
+                ip_address=self._ip, port=self._dashboard_agent.node_manager_port
+            )
+
     async def GetTraceback(self, request, context):
         pid = request.pid
         native = request.native
@@ -888,6 +900,14 @@ def _get_disk_io_stats():
             stats.write_count,
         )
 
+    async def _async_get_worker_pids_from_raylet(self) -> List[int]:
+        try:
+            # Get worker pids from raylet via gRPC.
+            return await self._raylet_client.async_get_worker_pids()
+        except (GetTimeoutError, RpcError):
+            logger.exception("Failed to get worker pids from raylet")
+            return []
+
     def _get_agent_proc(self) -> psutil.Process:
         # Agent is the current process.
         # This method is not necessary, but we have it for mock testing.
@@ -896,27 +916,23 @@ def _get_agent_proc(self) -> psutil.Process:
     def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]:
         return (proc.pid, proc.create_time())
 
-    def _get_worker_processes(self):
-        raylet_proc = self._get_raylet_proc()
-        if raylet_proc is None:
+    async def _async_get_worker_processes(self):
+        pids = await self._async_get_worker_pids_from_raylet()
+        logger.debug(f"Worker PIDs from raylet: {pids}")
+        if not pids:
             return []
-        else:
-            workers = {}
-            if sys.platform == "win32":
-                # windows, get the child process not the runner
-                for child in raylet_proc.children():
-                    if child.children():
-                        child = child.children()[0]
-                    workers[self._generate_worker_key(child)] = child
-            else:
-                workers = {
-                    self._generate_worker_key(proc): proc
-                    for proc in raylet_proc.children()
-                }
-            return workers
-
-    def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
-        workers = self._get_worker_processes()
+        workers = {}
+        for pid in pids:
+            try:
+                proc = psutil.Process(pid)
+                workers[self._generate_worker_key(proc)] = proc
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                logger.error(f"Failed to access worker process {pid}")
+                continue
+        return workers
+
+    async def _async_get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
+        workers = await self._async_get_worker_processes()
         if not workers:
             return []
         else:
@@ -936,9 +952,6 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
             for k in keys_to_pop:
                 self._workers.pop(k)
 
-            # Remove the current process (reporter agent), which is also a child of
-            # the Raylet.
-            self._workers.pop(self._generate_worker_key(self._get_agent_proc()))
             # Build process ID -> GPU info mapping for faster lookups
             gpu_pid_mapping = defaultdict(list)
             if gpus is not None:
@@ -1058,7 +1071,7 @@ def _get_shm_usage(self):
             return None
         return mem.shared
 
-    def _collect_stats(self):
+    async def _async_collect_stats(self):
         now = dashboard_utils.to_posix_time(datetime.datetime.utcnow())
         network_stats = self._get_network_stats()
         self._network_stats_hist.append((now, network_stats))
@@ -1079,7 +1092,7 @@ def _collect_stats(self):
             "mem": self._get_mem_usage(),
             # Unit is in bytes. None if
             "shm": self._get_shm_usage(),
-            "workers": self._get_workers(gpus),
+            "workers": await self._async_get_workers(gpus),
             "raylet": raylet,
             "agent": self._get_agent(),
             "bootTime": self._get_boot_time(),
@@ -1726,7 +1739,7 @@ async def _run_loop(self):
            # executor (TPE) to avoid blocking the Agent's event-loop
            json_payload = await loop.run_in_executor(
                self._executor,
-                self._compose_stats_payload,
+                self._run_in_executor,
                autoscaler_status_json_bytes,
            )
 
@@ -1739,10 +1752,15 @@ async def _run_loop(self):
 
            await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000)
 
-    def _compose_stats_payload(
+    def _run_in_executor(self, cluster_autoscaling_stats_json: Optional[bytes]) -> str:
+        return asyncio.run(
+            self._async_compose_stats_payload(cluster_autoscaling_stats_json)
+        )
+
+    async def _async_compose_stats_payload(
         self, cluster_autoscaling_stats_json: Optional[bytes]
     ) -> str:
-        stats = self._collect_stats()
+        stats = await self._async_collect_stats()
 
         # Report stats only when metrics collection is enabled.
         if not self._metrics_collection_disabled:
```
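The last two hunks bridge the agent's thread-pool executor with the new async collection path: the synchronous wrapper handed to `run_in_executor` starts its own event loop via `asyncio.run()`, so the gRPC call to the raylet can be awaited without blocking the agent's main loop. A minimal standalone sketch of that pattern, with illustrative names rather than Ray's own:

```python
# Minimal sketch (not from the PR) of the executor/event-loop bridging pattern:
# a synchronous wrapper runs a coroutine on its own event loop inside a
# thread-pool executor, keeping the caller's event loop unblocked.
import asyncio
from concurrent.futures import ThreadPoolExecutor


async def _collect() -> str:
    # Placeholder for the async stats collection (e.g. awaiting a gRPC reply).
    await asyncio.sleep(0.1)
    return "stats-json"


def _run_in_worker_thread() -> str:
    # The executor thread gets its own event loop via asyncio.run().
    return asyncio.run(_collect())


async def main() -> None:
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=1) as executor:
        payload = await loop.run_in_executor(executor, _run_in_worker_thread)
        print(payload)


if __name__ == "__main__":
    asyncio.run(main())
```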
