unify gpu checking around gpustat #35581

Closed · wants to merge 2 commits
python/ray/_private/accelerators/nvidia_gpu.py: 6 additions & 46 deletions
@@ -6,10 +6,7 @@
 import importlib
 from typing import Optional, List, Tuple

-try:
-    import GPUtil
-except ImportError:
-    pass
+import gpustat

 from ray._private.accelerators.accelerator import AcceleratorManager

@@ -53,54 +50,17 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:

     @staticmethod
     def get_current_node_num_accelerators() -> int:
-        num_gpus = 0
-        if importlib.util.find_spec("GPUtil"):
-            gpu_list = GPUtil.getGPUs()
-            num_gpus = len(gpu_list)
-        elif sys.platform.startswith("linux"):
-            proc_gpus_path = "/proc/driver/nvidia/gpus"
-            if os.path.isdir(proc_gpus_path):
-                num_gpus = len(os.listdir(proc_gpus_path))
-        elif sys.platform == "win32":
-            props = "AdapterCompatibility"
-            cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
-            lines = subprocess.check_output(cmdargs).splitlines()[1:]
-            num_gpus = len([x.rstrip() for x in lines if x.startswith(b"NVIDIA")])
+        num_gpus = gpustat.gpu_count()
Collaborator: gpustat is currently only installed with ray[default], so I think we still need the old code that checks "/proc/driver/nvidia/gpus" for a minimal Ray install?
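A minimal sketch of that fallback idea (a hypothetical helper, not part of this PR), assuming gpustat may be absent in a minimal install:

import importlib.util
import os
import sys


def count_nvidia_gpus() -> int:
    # Prefer gpustat when it is available (installed with ray[default]).
    if importlib.util.find_spec("gpustat") is not None:
        import gpustat

        return gpustat.gpu_count()
    # Minimal install: on Linux the NVIDIA driver exposes one directory per GPU.
    if sys.platform.startswith("linux"):
        proc_gpus_path = "/proc/driver/nvidia/gpus"
        if os.path.isdir(proc_gpus_path):
            return len(os.listdir(proc_gpus_path))
    return 0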

Contributor Author: Would it be acceptable to make gpustat an unconditional dependency for working with GPUs and Ray? That code is very fragile.

Collaborator: It might be hard, since gpustat has external dependencies of its own:

install_requires = [
    'nvidia-ml-py>=12.535.108',  # see #107, #143, #161
    'psutil>=5.6.0',    # GH-1447
    'blessed>=1.17.1',  # GH-126
    'typing_extensions',
]

Collaborator: Should we copy the auto-detect code that PyTorch has in torch.cuda.device_count()? I think it doesn't depend on GPUtil or gpustat.

Contributor Author: That goes to this code, which eventually uses ctypes and libnvidia-ml.so.1. How does that work on Windows and macOS?
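For reference, a rough sketch of what that path boils down to (illustrative only, not the actual PyTorch source): load NVML through ctypes and ask it for the device count. On Windows the library is nvml.dll; macOS has no NVML, so the load fails and the count is reported as zero.

import ctypes
import sys


def nvml_device_count() -> int:
    # libnvidia-ml.so.1 on Linux, nvml.dll on Windows; neither exists on macOS.
    lib_name = "nvml.dll" if sys.platform == "win32" else "libnvidia-ml.so.1"
    try:
        nvml = ctypes.CDLL(lib_name)
    except OSError:
        return 0  # no NVIDIA driver / NVML available
    if nvml.nvmlInit_v2() != 0:  # 0 means NVML_SUCCESS
        return 0
    count = ctypes.c_uint(0)
    ret = nvml.nvmlDeviceGetCount_v2(ctypes.byref(count))
    nvml.nvmlShutdown()
    return count.value if ret == 0 else 0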

         return num_gpus

     @staticmethod
     def get_current_node_accelerator_type() -> Optional[str]:
         try:
-            if importlib.util.find_spec("GPUtil"):
-                gpu_list = GPUtil.getGPUs()
-                if len(gpu_list) > 0:
-                    gpu_list_names = [gpu.name for gpu in gpu_list]
-                    return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
-                        gpu_list_names.pop()
-                    )
-            elif sys.platform.startswith("linux"):
-                proc_gpus_path = "/proc/driver/nvidia/gpus"
-                if not os.path.isdir(proc_gpus_path):
-                    return None
-                gpu_dirs = os.listdir(proc_gpus_path)
-                if len(gpu_dirs) == 0:
-                    return None
-                gpu_info_path = f"{proc_gpus_path}/{gpu_dirs[0]}/information"
-                info_str = open(gpu_info_path).read()
-                if not info_str:
-                    return None
-                lines = info_str.split("\n")
-                full_model_name = None
-                for line in lines:
-                    split = line.split(":")
-                    if len(split) != 2:
-                        continue
-                    k, v = split
-                    if k.strip() == "Model":
-                        full_model_name = v.strip()
-                        break
+            gpu_list = gpustat.new_query()
+            if len(gpu_list) > 0:
+                gpu_list_names = [gpu.name for gpu in gpu_list]
                 return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
-                    full_model_name
+                    gpu_list_names.pop()
                 )
         except Exception:
             logger.exception("Could not parse gpu information.")
python/ray/tune/tests/test_trainable_util.py: 11 additions & 7 deletions
@@ -137,33 +137,37 @@ def test_raises_error_on_key_conflict(self):
         unflatten_dict({"a/b": 2, "a/b/c": 3})


-class GPUUtilMock:
+class GpustatMock:
     class GPU:
         def __init__(self, id, uuid, util=None):
-            self.id = id
+            self.index = id
             self.uuid = uuid
-            self.util = [0.5, 0.0]
+            self.util = [12000, 0]

         @property
-        def memoryUtil(self):
+        def memory_used(self):
             if self.util:
                 return self.util.pop(0)
             return 0

+        @property
+        def memory_total(self):
+            return 24000
+
     def __init__(self, gpus, gpu_uuids):
         self.gpus = gpus
         self.uuids = gpu_uuids
         self.gpu_list = [
             self.GPU(gpu, uuid) for gpu, uuid in zip(self.gpus, self.uuids)
         ]

-    def getGPUs(self):
+    def new_query(self):
         return self.gpu_list


 class GPUTest(unittest.TestCase):
     def setUp(self):
-        sys.modules["GPUtil"] = GPUUtilMock([0, 1], ["GPU-aaa", "GPU-bbb"])
+        sys.modules["gpustat"] = GpustatMock([0, 1], ["GPU-aaa", "GPU-bbb"])

     def testGPUWait1(self):
         wait_for_gpu(0, delay_s=0)
@@ -188,7 +192,7 @@ def testGPUWaitFail(self):
     def testDefaultGPU(self):
         import sys

-        sys.modules["GPUtil"] = GPUUtilMock([0], ["GPU-aaa"])
+        sys.modules["gpustat"] = GpustatMock([0], ["GPU-aaa"])
         wait_for_gpu(delay_s=0)


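The mock above only needs to cover the handful of gpustat fields the Tune code touches. For comparison, a hedged sketch (assuming gpustat and an NVIDIA driver are actually installed) of reading the same fields from a real query:

import gpustat

# gpustat.new_query() returns one entry per GPU; these are the fields
# wait_for_gpu and UtilMonitor rely on after this change.
for gpu in gpustat.new_query():
    vram_fraction = gpu.memory_used / gpu.memory_total
    print(gpu.index, gpu.uuid, gpu.name, gpu.utilization, vram_fraction)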
python/ray/tune/utils/util.py: 28 additions & 28 deletions
@@ -32,12 +32,12 @@
 logger = logging.getLogger(__name__)


-def _import_gputil():
+def _import_gpustat():
     try:
-        import GPUtil
+        import gpustat
     except ImportError:
-        GPUtil = None
-    return GPUtil
+        gpustat = None
+    return gpustat


 START_OF_TIME = time.time()
@@ -50,25 +50,25 @@ class UtilMonitor(Thread):
     It keeps track of CPU, RAM, GPU, VRAM usage (each gpu separately) by
     pinging for information every x seconds in a separate thread.

-    Requires psutil and GPUtil to be installed. Can be enabled with
+    Requires psutil and gpustat to be installed. Can be enabled with
     Tuner(param_space={"log_sys_usage": True}).
     """

     def __init__(self, start=True, delay=0.7):
         self.stopped = True
-        GPUtil = _import_gputil()
-        self.GPUtil = GPUtil
-        if GPUtil is None and start:
-            logger.warning("Install gputil for GPU system monitoring.")
+        gpustat = _import_gpustat()
+        self.gpustat = gpustat
+        if gpustat is None and start:
+            logger.warning("Install gpustat for GPU system monitoring.")

         if psutil is None and start:
             logger.warning("Install psutil to monitor system performance.")

-        if GPUtil is None and psutil is None:
+        if gpustat is None and psutil is None:
             return

         super(UtilMonitor, self).__init__()
-        self.delay = delay  # Time between calls to GPUtil
+        self.delay = delay  # Time between calls to gpustat
         self.values = defaultdict(list)
         self.lock = threading.Lock()
         self.daemon = True
@@ -84,18 +84,18 @@ def _read_utilization(self):
                 self.values["ram_util_percent"].append(
                     float(getattr(psutil.virtual_memory(), "percent"))
                 )
-            if self.GPUtil is not None:
+            if self.gpustat is not None:
                 gpu_list = []
                 try:
-                    gpu_list = self.GPUtil.getGPUs()
+                    gpu_list = self.gpustat.new_query()
                 except Exception:
-                    logger.debug("GPUtil failed to retrieve GPUs.")
+                    logger.debug("gpustat failed to retrieve GPUs.")
                 for gpu in gpu_list:
                     self.values["gpu_util_percent" + str(gpu.id)].append(
-                        float(gpu.load)
+                        float(gpu.utilization)
                     )
                     self.values["vram_util_percent" + str(gpu.id)].append(
-                        float(gpu.memoryUtil)
+                        float(gpu.memory_used / gpu.memory_total)
                     )

     def get_data(self):
@@ -451,11 +451,11 @@ def wait_for_gpu(
 ):
     """Checks if a given GPU has freed memory.

-    Requires ``gputil`` to be installed: ``pip install gputil``.
+    Requires ``gpustat`` to be installed: ``pip install gpustat``.

     Args:
         gpu_id: GPU id or uuid to check.
-            Must be found within GPUtil.getGPUs(). If none, resorts to
+            Must be found within gpustat.new_query(). If None, resorts to
             the first item returned from `ray.get_gpu_ids()`.
         target_util: The utilization threshold to reach to unblock.
             Set this to 0 to block until the GPU is completely free.
@@ -467,7 +467,7 @@
         bool: True if free.

     Raises:
-        RuntimeError: If GPUtil is not found, if no GPUs are detected
+        RuntimeError: If gpustat is not found, if no GPUs are detected
             or if the check fails.

     Example:
@@ -488,10 +488,10 @@ def tune_func(config):
         tuner.fit()

     """
-    GPUtil = _import_gputil()
+    gpustat = _import_gpustat()

-    if GPUtil is None:
-        raise RuntimeError("GPUtil must be installed if calling `wait_for_gpu`.")
+    if gpustat is None:
+        raise RuntimeError("gpustat must be installed if calling `wait_for_gpu`.")

     if gpu_id is None:
         gpu_id_list = ray.get_gpu_ids()
@@ -502,7 +502,7 @@
             )
         gpu_id = gpu_id_list[0]

-    gpu_attr = "id"
+    gpu_attr = "index"
     if isinstance(gpu_id, str):
         if gpu_id.isdigit():
             # GPU ID returned from `ray.get_gpu_ids()` is a str representation
@@ -521,7 +521,7 @@ def gpu_id_fn(g):
         # the format of the input `gpu_id`
         return getattr(g, gpu_attr)

-    gpu_ids = {gpu_id_fn(g) for g in GPUtil.getGPUs()}
+    gpu_ids = {gpu_id_fn(g) for g in gpustat.new_query()}
     if gpu_id not in gpu_ids:
         raise ValueError(
             f"{gpu_id} not found in set of available GPUs: {gpu_ids}. "
@@ -530,11 +530,11 @@
         )

     for i in range(int(retry)):
-        gpu_object = next(g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id)
-        if gpu_object.memoryUtil > target_util:
+        gpu_object = next(g for g in gpustat.new_query() if gpu_id_fn(g) == gpu_id)
+        if gpu_object.memory_used > target_util:
             logger.info(
-                f"Waiting for GPU util to reach {target_util}. "
-                f"Util: {gpu_object.memoryUtil:0.3f}"
+                f"Waiting for gpu memory used to be less than {target_util}. "
+                f"Used: {gpu_object.memory_used:0.3f}"
             )
             time.sleep(delay_s)
         else:
python/requirements.txt: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ scipy
 colorful
 pyyaml
 rich
-gpustat>=1.0.0
+gpustat>=1.1.0
 opentelemetry-sdk
 fastapi
 gymnasium==0.28.1
python/setup.py: 1 addition & 1 deletion
@@ -247,7 +247,7 @@ def get_packages(self):
     "colorful",
     "py-spy >= 0.2.0",
     "requests",
-    "gpustat >= 1.0.0",  # for windows
+    "gpustat >= 1.1.0",
     "grpcio >= 1.32.0; python_version < '3.10'",  # noqa:E501
     "grpcio >= 1.42.0; python_version >= '3.10'",  # noqa:E501
     "opencensus",