[Dashboard] Remove gpustats dependencies from Ray[default] #41044

Merged · 9 commits · Nov 22, 2023

Changes from 4 commits
1 change: 0 additions & 1 deletion ci/env/check_minimal_install.py
@@ -18,7 +18,6 @@
"aiohttp_cors",
"colorful",
"py-spy",
"gpustat",
"opencensus",
"prometheus_client",
"smart_open",
12 changes: 6 additions & 6 deletions dashboard/client/src/pages/node/index.tsx
@@ -85,13 +85,13 @@ const columns = [
helpInfo: (
<Typography>
Usage of each GPU device. If no GPU usage is detected, here are the
potential root causes: <br />
1. library gpustsat is not installed. Install gpustat and try again.
<br /> 2. non-GPU Ray image is used on this node. Switch to a GPU Ray
potential root causes:<br />
1. non-GPU Ray image is used on this node. Switch to a GPU Ray
image and try again. <br />
3. AMD GPUs are being used. AMD GPUs are not currently supported by
gpustat module. <br />
4. gpustat module raises an exception.
2. Non Nvidia GPUs are being used. Non Nvidia GPUs' utilizations are not
currently supported.
<br />
3. pynvml module raises an exception.
</Typography>
),
},
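The updated help text points users at pynvml rather than gpustat. As a quick check for root cause 3 ("pynvml module raises an exception"), a node can be probed directly with the vendored module — a minimal sketch, assuming the `ray._private.thirdparty.pynvml` import path used elsewhere in this PR:

```python
# Minimal probe of the vendored pynvml used by the dashboard agent (sketch).
# If nvmlInit() raises, the GPU panel for this node will show no usage.
import ray._private.thirdparty.pynvml as pynvml

try:
    pynvml.nvmlInit()
    num_gpus = pynvml.nvmlDeviceGetCount()
    print(f"pynvml OK, {num_gpus} GPU(s) visible")
    for i in range(num_gpus):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print(f"  GPU {i}: {pynvml.nvmlDeviceGetName(handle)}")
    pynvml.nvmlShutdown()
except pynvml.NVMLError as e:
    # Expected on nodes without an NVIDIA driver, e.g. NVMLError_DriverNotLoaded.
    print(f"pynvml failed: {e}")
```

If `nvmlInit()` raises (for example `NVMLError_DriverNotLoaded` on a CPU-only node), the GPU columns stay empty, matching the behavior in `reporter_agent.py` below.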
15 changes: 4 additions & 11 deletions dashboard/client/src/type/node.d.ts
@@ -54,28 +54,21 @@ export type NodeListRsp = {
msg: string;
};

export type GPUProcessStats = {
// Sub stat of GPU stats, this type represents the GPU
// utilization of a single process of a single GPU.
username: string;
command: string;
gpuMemoryUsage: number;
export type ProcessGPUUsage = {
// This gpu usage stats from a process
pid: number;
gpuMemoryUsage: number;
};

export type GPUStats = {
// This represents stats fetched from a node about a single GPU
uuid: string;
index: number;
name: string;
temperatureGpu: number;
fanSpeed: number;
utilizationGpu?: number;
powerDraw: number;
enforcedPowerLimit: number;
memoryUsed: number;
memoryTotal: number;
processes?: GPUProcessStats[];
processes?: ProcessGPUUsage[];
};

export type NodeDetailExtend = {
120 changes: 76 additions & 44 deletions dashboard/modules/reporter/reporter_agent.py
@@ -6,11 +6,10 @@
import socket
import sys
import traceback
import warnings

import psutil

from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, TypedDict
from collections import defaultdict

import ray
@@ -28,8 +27,8 @@
from prometheus_client.core import REGISTRY
from ray._private.metrics_agent import Gauge, MetricsAgent, Record
from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS
import ray._private.thirdparty.pynvml as pynvml
from ray.core.generated import reporter_pb2, reporter_pb2_grpc
from ray.util.debug import log_once
from ray.dashboard import k8s_utils
from ray._raylet import WorkerID

@@ -48,25 +47,6 @@
# Using existence of /sys/fs/cgroup as the criterion is consistent with
# Ray's existing resource logic, see e.g. ray._private.utils.get_num_cpus().

try:
import gpustat.core as gpustat
except ModuleNotFoundError:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"`gpustat` package is not installed. GPU monitoring is "
"not available. To have full functionality of the "
"dashboard please install `pip install ray["
"default]`.)"
)
except ImportError as e:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"Importing gpustat failed, fix this to have full "
"functionality of the dashboard. The original error was:\n\n" + e.msg
)


def recursive_asdict(o):
if isinstance(o, tuple) and hasattr(o, "_asdict"):
@@ -268,6 +248,29 @@ def jsonify_asdict(o) -> str:
),
}

MB = 1024 * 1024

# Types
Percentage = int
Megabytes = int


# gpu utilization for nvidia gpu from a single process
class ProcessGPUInfo(TypedDict):
pid: int
gpu_memory_usage: int


# gpu utilization for nvidia gpu
class GpuUtilizationInfo(TypedDict):
index: int
name: str
uuid: str
utilization_gpu: Optional[Percentage]
memory_used: Megabytes
memory_total: Megabytes
processes_pids: Optional[List[ProcessGPUInfo]]


class ReporterAgent(
dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer
@@ -394,30 +397,66 @@ def _get_cpu_percent(in_k8s: bool):
@staticmethod
def _get_gpu_usage():
global enable_gpu_usage_check
if gpustat is None or not enable_gpu_usage_check:
if not enable_gpu_usage_check:
return []
gpu_utilizations = []
gpus = []
try:
gpus = gpustat.new_query().gpus
pynvml.nvmlInit()
num_gpus = pynvml.nvmlDeviceGetCount()
for i in range(num_gpus):
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
utilization = None
try:
utilization_info = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
utilization = int(utilization_info.gpu)
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU utilization: {e}")

# processes pids
processes_pids = None
try:
nv_comp_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
gpu_handle
)
nv_graphics_processes = (
pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
)
processes_pids = [
ProcessGPUInfo(
pid=nv_process.pid,
gpu_memory_usage=nv_process.usedGpuMemory // MB
if nv_process.usedGpuMemory
else 0,
)
for nv_process in (nv_comp_processes + nv_graphics_processes)
]
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU processes: {e}")

info = GpuUtilizationInfo(
index=i,
name=pynvml.nvmlDeviceGetName(gpu_handle),
uuid=pynvml.nvmlDeviceGetUUID(gpu_handle),
utilization_gpu=utilization,
memory_used=int(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).used)
// MB,
memory_total=int(pynvml.nvmlDeviceGetMemoryInfo(gpu_handle).total)
// MB,
Review comment (Contributor):
We could merge these two nvmlDeviceGetMemoryInfo calls into one.
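For illustration, a minimal sketch of that suggestion — the two `nvmlDeviceGetMemoryInfo` calls collapsed into a single query whose `used` and `total` fields are reused (assumes the same `gpu_handle`, `MB`, and surrounding variables as in this diff):

```python
# Sketch (inside the per-GPU loop of ReporterAgent._get_gpu_usage):
# query NVML memory info once per device and reuse the struct's
# `used` and `total` fields instead of calling nvmlDeviceGetMemoryInfo twice.
memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)

info = GpuUtilizationInfo(
    index=i,
    name=pynvml.nvmlDeviceGetName(gpu_handle),
    uuid=pynvml.nvmlDeviceGetUUID(gpu_handle),
    utilization_gpu=utilization,
    memory_used=int(memory_info.used) // MB,
    memory_total=int(memory_info.total) // MB,
    processes_pids=processes_pids,
)
```

This also saves one NVML round trip per device.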

Review comment (Contributor):
FYI, in NVIDIA driver 510.39.01, a v2 memory info API was added:

https://github.com/NVIDIA/nvidia-settings/blob/510.39.01/src/nvml.h#L218-L241

The unversioned API (v1) and v2 API will return different results on R510+ drivers.

processes_pids=processes_pids,
)
gpu_utilizations.append(info)
pynvml.nvmlShutdown()
except Exception as e:
logger.debug(f"gpustat failed to retrieve GPU information: {e}")
logger.debug(f"pynvml failed to retrieve GPU information: {e}")

# gpustat calls pynvml.nvmlInit()
# On machines without GPUs, this can run subprocesses that spew to
# stderr. Then with log_to_driver=True, we get log spew from every
# On machines without GPUs, pynvml.nvmlInit() can run subprocesses that
# spew to stderr. Then with log_to_driver=True, we get log spew from every
# single raylet. To avoid this, disable the GPU usage check on
# certain errors.
# https://github.com/ray-project/ray/issues/14305
# https://github.com/ray-project/ray/pull/21686
if type(e).__name__ == "NVMLError_DriverNotLoaded":
enable_gpu_usage_check = False

for gpu in gpus:
# Note the keys in this dict have periods which throws
# off javascript so we change .s to _s
gpu_data = {"_".join(key.split(".")): val for key, val in gpu.entry.items()}
gpu_utilizations.append(gpu_data)
return gpu_utilizations

@staticmethod
@@ -911,21 +950,14 @@ def _record_stats(self, stats, cluster_stats):
)
records_reported.append(node_mem_shared)

# The output example of gpustats.
# The output example of GpuUtilizationInfo.
"""
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
'memory_total': 22731}
"""
# -- GPU per node --
gpus = stats["gpus"]
36 changes: 0 additions & 36 deletions dashboard/modules/reporter/tests/test_reporter.py
@@ -359,13 +359,7 @@ def test_report_stats_gpu():
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
@@ -376,13 +370,7 @@
"index": 0,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 0,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 0,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -391,13 +379,7 @@
"index": 1,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 1,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 1,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -406,13 +388,7 @@
"index": 2,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 2,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 2,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -421,13 +397,7 @@
{
"index": 3,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -436,13 +406,7 @@
{
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": 22731,
"processes": [],
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_gpu_auto_transfer.py
@@ -66,7 +66,7 @@ def host_to_device_auto_pipeline(device):

def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
"""Tests that auto_transfer uses the right device for the cuda stream."""
import pynvml
import ray._private.thirdparty.pynvml as pynvml

pynvml.nvmlInit()

1 change: 0 additions & 1 deletion python/requirements.txt
@@ -38,7 +38,6 @@ scipy
colorful
pyyaml
rich
gpustat>=1.0.0
opentelemetry-sdk
fastapi
gymnasium==0.28.1
1 change: 0 additions & 1 deletion python/setup.py
@@ -247,7 +247,6 @@ def get_packages(self):
"colorful",
"py-spy >= 0.2.0",
"requests",
"gpustat >= 1.0.0", # for windows
"grpcio >= 1.32.0; python_version < '3.10'", # noqa:E501
"grpcio >= 1.42.0; python_version >= '3.10'", # noqa:E501
"opencensus",
3 changes: 2 additions & 1 deletion src/ray/raylet/scheduling/cluster_resource_scheduler.cc
@@ -45,7 +45,8 @@ ClusterResourceScheduler::ClusterResourceScheduler(
std::function<bool(scheduling::NodeID)> is_node_available_fn,
std::function<int64_t(void)> get_used_object_store_memory,
std::function<bool(void)> get_pull_manager_at_capacity,
const absl::flat_hash_map<std::string, std::string> &local_node_labels)
const absl::flat_hash_map<std::string, std::string>
&local_node_labels) // pass this to local resource manager
: local_node_id_(local_node_id), is_node_available_fn_(is_node_available_fn) {
NodeResources node_resources = ResourceMapToNodeResources(
local_node_resources, local_node_resources, local_node_labels);