
Revert "[Dashboard] Remove gpustats dependencies from Ray[default] (#…
Browse files Browse the repository at this point in the history
…41044)" (#41375)

Reverts #41044

Premerge is busted and is potentially blocking people from merging into the branch cut. Reverting to unblock.

Failing test: linux://python/ray/tests:test_streaming_generator_2

This reverts commit 9b9fb55.
can-anyscale authored Nov 24, 2023
1 parent d3ac31e commit 1fbfa07
Showing 11 changed files with 1,380 additions and 188 deletions.
1 change: 1 addition & 0 deletions ci/env/check_minimal_install.py
@@ -18,6 +18,7 @@
"aiohttp_cors",
"colorful",
"py-spy",
"gpustat",
"opencensus",
"prometheus_client",
"smart_open",
15 changes: 7 additions & 8 deletions dashboard/client/src/pages/node/index.tsx
@@ -85,14 +85,13 @@ const columns = [
helpInfo: (
<Typography>
Usage of each GPU device. If no GPU usage is detected, here are the
potential root causes:
<br />
1. non-GPU Ray image is used on this node. Switch to a GPU Ray image and
try again. <br />
2. Non Nvidia GPUs are being used. Non Nvidia GPUs' utilizations are not
currently supported.
<br />
3. pynvml module raises an exception.
potential root causes: <br />
1. library gpustat is not installed. Install gpustat and try again.
<br /> 2. non-GPU Ray image is used on this node. Switch to a GPU Ray
image and try again. <br />
3. AMD GPUs are being used. AMD GPUs are not currently supported by
gpustat module. <br />
4. gpustat module raises an exception.
</Typography>
),
},
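For readers hitting the "no GPU usage detected" case above, a minimal diagnostic sketch (illustrative only, not part of this commit; it leans on the same gpustat.new_query() call that reporter_agent.py uses later in this diff) is:

try:
    import gpustat
except ImportError:
    # Root cause 1: gpustat is missing from the environment.
    print("gpustat is not installed; install it via pip install ray[default] or pip install gpustat.")
else:
    try:
        # new_query() queries the NVIDIA driver through pynvml.
        for gpu in gpustat.new_query().gpus:
            print(gpu.entry["index"], gpu.entry["name"], gpu.entry["utilization.gpu"])
    except Exception as exc:
        # Root cause 4: gpustat itself raised an exception.
        print(f"gpustat raised an exception: {exc}")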
15 changes: 11 additions & 4 deletions dashboard/client/src/type/node.d.ts
@@ -54,21 +54,28 @@ export type NodeListRsp = {
msg: string;
};

export type ProcessGPUUsage = {
// This gpu usage stats from a process
pid: number;
export type GPUProcessStats = {
// Sub stat of GPU stats, this type represents the GPU
// utilization of a single process of a single GPU.
username: string;
command: string;
gpuMemoryUsage: number;
pid: number;
};

export type GPUStats = {
// This represents stats fetched from a node about a single GPU
uuid: string;
index: number;
name: string;
temperatureGpu: number;
fanSpeed: number;
utilizationGpu?: number;
powerDraw: number;
enforcedPowerLimit: number;
memoryUsed: number;
memoryTotal: number;
processes?: ProcessGPUUsage[];
processes?: GPUProcessStats[];
};

export type NodeDetailExtend = {
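For orientation, the Python sketch below (illustrative, not part of this commit) shows the kind of per-GPU record these frontend types model. The snake_case keys come from the gpustat output example documented in reporter_agent.py further down; the single process entry and its values are assumed for illustration and mirror the GPUProcessStats fields above:

gpu_record_example = {
    "index": 0,
    "uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396",
    "name": "NVIDIA A10G",
    "temperature_gpu": 20,
    "fan_speed": 0,
    "utilization_gpu": 1,
    "power_draw": 51,
    "enforced_power_limit": 300,
    "memory_used": 0,
    "memory_total": 22731,
    "processes": [
        {"username": "ray", "command": "python", "gpu_memory_usage": 0, "pid": 12345},
    ],
}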
125 changes: 44 additions & 81 deletions dashboard/modules/reporter/reporter_agent.py
@@ -6,10 +6,11 @@
import socket
import sys
import traceback
import warnings

import psutil

from typing import List, Optional, Tuple, TypedDict, Union
from typing import List, Optional, Tuple
from collections import defaultdict

import ray
@@ -27,8 +28,8 @@
from prometheus_client.core import REGISTRY
from ray._private.metrics_agent import Gauge, MetricsAgent, Record
from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS
import ray._private.thirdparty.pynvml as pynvml
from ray.core.generated import reporter_pb2, reporter_pb2_grpc
from ray.util.debug import log_once
from ray.dashboard import k8s_utils
from ray._raylet import WorkerID

@@ -47,6 +48,25 @@
# Using existence of /sys/fs/cgroup as the criterion is consistent with
# Ray's existing resource logic, see e.g. ray._private.utils.get_num_cpus().

try:
import gpustat.core as gpustat
except ModuleNotFoundError:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"`gpustat` package is not installed. GPU monitoring is "
"not available. To have full functionality of the "
"dashboard please install `pip install ray["
"default]`.)"
)
except ImportError as e:
gpustat = None
if log_once("gpustat_import_warning"):
warnings.warn(
"Importing gpustat failed, fix this to have full "
"functionality of the dashboard. The original error was:\n\n" + e.msg
)


def recursive_asdict(o):
if isinstance(o, tuple) and hasattr(o, "_asdict"):
@@ -248,29 +268,6 @@ def jsonify_asdict(o) -> str:
),
}

MB = 1024 * 1024

# Types
Percentage = int
Megabytes = int


# gpu utilization for nvidia gpu from a single process
class ProcessGPUInfo(TypedDict):
pid: int
gpu_memory_usage: Megabytes


# gpu utilization for nvidia gpu
class GpuUtilizationInfo(TypedDict):
index: int
name: str
uuid: str
utilization_gpu: Optional[Percentage]
memory_used: Megabytes
memory_total: Megabytes
processes_pids: Optional[List[ProcessGPUInfo]]


class ReporterAgent(
dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer
@@ -397,71 +394,30 @@ def _get_cpu_percent(in_k8s: bool):
@staticmethod
def _get_gpu_usage():
global enable_gpu_usage_check
if not enable_gpu_usage_check:
if gpustat is None or not enable_gpu_usage_check:
return []
gpu_utilizations = []

def decode(b: Union[str, bytes]) -> str:
if isinstance(b, bytes):
return b.decode("utf-8") # for python3, to unicode
return b

gpus = []
try:
pynvml.nvmlInit()
num_gpus = pynvml.nvmlDeviceGetCount()
for i in range(num_gpus):
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i)
memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
utilization = None
try:
utilization_info = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
utilization = int(utilization_info.gpu)
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU utilization: {e}")

# processes pids
processes_pids = None
try:
nv_comp_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(
gpu_handle
)
nv_graphics_processes = (
pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
)
processes_pids = [
ProcessGPUInfo(
pid=int(nv_process.pid),
gpu_memory_usage=int(nv_process.usedGpuMemory) // MB
if nv_process.usedGpuMemory
else 0,
)
for nv_process in (nv_comp_processes + nv_graphics_processes)
]
except pynvml.NVMLError as e:
logger.debug(f"pynvml failed to retrieve GPU processes: {e}")

info = GpuUtilizationInfo(
index=i,
name=decode(pynvml.nvmlDeviceGetName(gpu_handle)),
uuid=decode(pynvml.nvmlDeviceGetUUID(gpu_handle)),
utilization_gpu=utilization,
memory_used=int(memory_info.used) // MB,
memory_total=int(memory_info.total) // MB,
processes_pids=processes_pids,
)
gpu_utilizations.append(info)
pynvml.nvmlShutdown()
gpus = gpustat.new_query().gpus
except Exception as e:
logger.debug(f"pynvml failed to retrieve GPU information: {e}")
logger.debug(f"gpustat failed to retrieve GPU information: {e}")

# On machines without GPUs, pynvml.nvmlInit() can run subprocesses that
# spew to stderr. Then with log_to_driver=True, we get log spew from every
# gpustat calls pynvml.nvmlInit()
# On machines without GPUs, this can run subprocesses that spew to
# stderr. Then with log_to_driver=True, we get log spew from every
# single raylet. To avoid this, disable the GPU usage check on
# certain errors.
# https://github.com/ray-project/ray/issues/14305
# https://github.com/ray-project/ray/pull/21686
if type(e).__name__ == "NVMLError_DriverNotLoaded":
enable_gpu_usage_check = False

for gpu in gpus:
# Note the keys in this dict have periods which throws
# off javascript so we change .s to _s
gpu_data = {"_".join(key.split(".")): val for key, val in gpu.entry.items()}
gpu_utilizations.append(gpu_data)
return gpu_utilizations
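# Illustrative sketch (not part of this commit): the comprehension above turns
# gpustat's dotted entry keys into underscore keys so they are safe to use as
# JavaScript property names on the dashboard side, e.g.:
sample_entry = {"utilization.gpu": 1, "memory.used": 0, "memory.total": 22731}
assert {"_".join(k.split(".")): v for k, v in sample_entry.items()} == {
    "utilization_gpu": 1,
    "memory_used": 0,
    "memory_total": 22731,
}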

@staticmethod
@@ -955,14 +911,21 @@ def _record_stats(self, stats, cluster_stats):
)
records_reported.append(node_mem_shared)

# The output example of GpuUtilizationInfo.
# The output example of gpustats.
"""
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731}
'memory_total': 22731,
'processes': []}
"""
# -- GPU per node --
gpus = stats["gpus"]
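To see the degraded behavior this revert restores, a small sketch (assuming the dashboard module and its dependencies are importable in your environment; this is an internal API, not a supported one) is:

# With gpustat absent, the import guard near the top of reporter_agent.py sets
# gpustat = None, and _get_gpu_usage() returns an empty list instead of raising.
from ray.dashboard.modules.reporter.reporter_agent import ReporterAgent

print(ReporterAgent._get_gpu_usage())  # prints [] when gpustat is unavailable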
36 changes: 36 additions & 0 deletions dashboard/modules/reporter/tests/test_reporter.py
@@ -359,7 +359,13 @@ def test_report_stats_gpu():
{'index': 0,
'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396',
'name': 'NVIDIA A10G',
'temperature_gpu': 20,
'fan_speed': 0,
'utilization_gpu': 1,
'utilization_enc': 0,
'utilization_dec': 0,
'power_draw': 51,
'enforced_power_limit': 300,
'memory_used': 0,
'memory_total': 22731,
'processes': []}
@@ -370,7 +376,13 @@ def test_report_stats_gpu():
"index": 0,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b396",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 0,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 0,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -379,7 +391,13 @@
"index": 1,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b397",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 1,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 1,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -388,7 +406,13 @@
"index": 2,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 2,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 2,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -397,7 +421,13 @@
{
"index": 3,
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": GPU_MEMORY,
"processes": [],
@@ -406,7 +436,13 @@
{
"uuid": "GPU-36e1567d-37ed-051e-f8ff-df807517b398",
"name": "NVIDIA A10G",
"temperature_gpu": 20,
"fan_speed": 0,
"utilization_gpu": 3,
"utilization_enc": 0,
"utilization_dec": 0,
"power_draw": 51,
"enforced_power_limit": 300,
"memory_used": 3,
"memory_total": 22731,
"processes": [],
7 changes: 3 additions & 4 deletions python/ray/_private/accelerators/nvidia_gpu.py
@@ -63,11 +63,10 @@ def get_current_node_accelerator_type() -> Optional[str]:
cuda_device_type = None
if device_count > 0:
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)
if isinstance(device_name, bytes):
device_name = device_name.decode("utf-8")
cuda_device_type = (
NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(device_name)
NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
pynvml.nvmlDeviceGetName(handle)
)
)
pynvml.nvmlShutdown()
return cuda_device_type
5 changes: 2 additions & 3 deletions python/ray/_private/thirdparty/pynvml/__init__.py
@@ -1,4 +1,3 @@
from ray._private.thirdparty.pynvml.pynvml import *
# nvdia-ml-py version
# Note: we pick this version to use the V2 API which is supported by older drivers
__version__ = "11.495.46"
# current version
__version__ = "12.535.133"
