unify gpu checking around gpustat #35581

Closed · wants to merge 2 commits
python/ray/_private/accelerators/nvidia_gpu.py: 6 additions & 46 deletions
@@ -6,10 +6,7 @@
 import importlib
 from typing import Optional, List, Tuple

-try:
-    import GPUtil
-except ImportError:
-    pass
+import gpustat

 from ray._private.accelerators.accelerator import AcceleratorManager

@@ -53,54 +50,17 @@ def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:

     @staticmethod
     def get_current_node_num_accelerators() -> int:
-        num_gpus = 0
-        if importlib.util.find_spec("GPUtil"):
-            gpu_list = GPUtil.getGPUs()
-            num_gpus = len(gpu_list)
-        elif sys.platform.startswith("linux"):
-            proc_gpus_path = "/proc/driver/nvidia/gpus"
-            if os.path.isdir(proc_gpus_path):
-                num_gpus = len(os.listdir(proc_gpus_path))
-        elif sys.platform == "win32":
-            props = "AdapterCompatibility"
-            cmdargs = ["WMIC", "PATH", "Win32_VideoController", "GET", props]
-            lines = subprocess.check_output(cmdargs).splitlines()[1:]
-            num_gpus = len([x.rstrip() for x in lines if x.startswith(b"NVIDIA")])
+        num_gpus = gpustat.gpu_count()
Collaborator: gpustat is currently only installed with ray[default], so I think we still need the old code that checks "/proc/driver/nvidia/gpus" for a minimal Ray install?
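A minimal sketch of that fallback idea (a hypothetical helper, not part of this PR), assuming gpustat may be absent in a minimal install:

import importlib.util
import os
import sys


def count_nvidia_gpus() -> int:
    # Prefer gpustat when it is available (installed with ray[default]).
    if importlib.util.find_spec("gpustat") is not None:
        import gpustat

        return gpustat.gpu_count()
    # Minimal install: on Linux the NVIDIA driver exposes one directory per GPU.
    if sys.platform.startswith("linux"):
        proc_gpus_path = "/proc/driver/nvidia/gpus"
        if os.path.isdir(proc_gpus_path):
            return len(os.listdir(proc_gpus_path))
    return 0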

Contributor Author: Would it be acceptable to make gpustat an unconditional dependency for working with GPUs and Ray? That code is very fragile.

Collaborator: It might be hard, since gpustat has external dependencies of its own:

install_requires = [
    'nvidia-ml-py>=12.535.108',  # see #107, #143, #161
    'psutil>=5.6.0',    # GH-1447
    'blessed>=1.17.1',  # GH-126
    'typing_extensions',
]

Collaborator: Should we copy the auto-detect code that PyTorch has in torch.cuda.device_count()? I think it doesn't depend on GPUtil or gpustat.

Contributor Author: That goes to this code, which eventually uses ctypes and libnvidia-ml.so.1. How does that work on Windows and macOS?
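For reference, a rough sketch of what that path boils down to (illustrative only, not the actual PyTorch source): load NVML through ctypes and ask it for the device count. On Windows the library is nvml.dll; macOS has no NVML, so the load fails and the count is reported as zero.

import ctypes
import sys


def nvml_device_count() -> int:
    # libnvidia-ml.so.1 on Linux, nvml.dll on Windows; neither exists on macOS.
    lib_name = "nvml.dll" if sys.platform == "win32" else "libnvidia-ml.so.1"
    try:
        nvml = ctypes.CDLL(lib_name)
    except OSError:
        return 0  # no NVIDIA driver / NVML available
    if nvml.nvmlInit_v2() != 0:  # 0 means NVML_SUCCESS
        return 0
    count = ctypes.c_uint(0)
    ret = nvml.nvmlDeviceGetCount_v2(ctypes.byref(count))
    nvml.nvmlShutdown()
    return count.value if ret == 0 else 0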

         return num_gpus

     @staticmethod
     def get_current_node_accelerator_type() -> Optional[str]:
         try:
-            if importlib.util.find_spec("GPUtil"):
-                gpu_list = GPUtil.getGPUs()
-                if len(gpu_list) > 0:
-                    gpu_list_names = [gpu.name for gpu in gpu_list]
-                    return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
-                        gpu_list_names.pop()
-                    )
-            elif sys.platform.startswith("linux"):
-                proc_gpus_path = "/proc/driver/nvidia/gpus"
-                if not os.path.isdir(proc_gpus_path):
-                    return None
-                gpu_dirs = os.listdir(proc_gpus_path)
-                if len(gpu_dirs) == 0:
-                    return None
-                gpu_info_path = f"{proc_gpus_path}/{gpu_dirs[0]}/information"
-                info_str = open(gpu_info_path).read()
-                if not info_str:
-                    return None
-                lines = info_str.split("\n")
-                full_model_name = None
-                for line in lines:
-                    split = line.split(":")
-                    if len(split) != 2:
-                        continue
-                    k, v = split
-                    if k.strip() == "Model":
-                        full_model_name = v.strip()
-                        break
+            gpu_list = gpustat.new_query()
+            if len(gpu_list) > 0:
+                gpu_list_names = [gpu.name for gpu in gpu_list]
                 return NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
-                    full_model_name
+                    gpu_list_names.pop()
                 )
         except Exception:
             logger.exception("Could not parse gpu information.")
python/ray/tune/tests/test_trainable_util.py: 11 additions & 7 deletions
@@ -137,33 +137,37 @@ def test_raises_error_on_key_conflict(self):
         unflatten_dict({"a/b": 2, "a/b/c": 3})


-class GPUUtilMock:
+class GpustatMock:
     class GPU:
         def __init__(self, id, uuid, util=None):
-            self.id = id
+            self.index = id
             self.uuid = uuid
-            self.util = [0.5, 0.0]
+            self.util = [12000, 0]

         @property
-        def memoryUtil(self):
+        def memory_used(self):
             if self.util:
                 return self.util.pop(0)
             return 0

+        @property
+        def memory_total(self):
+            return 24000
+
     def __init__(self, gpus, gpu_uuids):
         self.gpus = gpus
         self.uuids = gpu_uuids
         self.gpu_list = [
             self.GPU(gpu, uuid) for gpu, uuid in zip(self.gpus, self.uuids)
         ]

-    def getGPUs(self):
+    def new_query(self):
         return self.gpu_list


 class GPUTest(unittest.TestCase):
     def setUp(self):
-        sys.modules["GPUtil"] = GPUUtilMock([0, 1], ["GPU-aaa", "GPU-bbb"])
+        sys.modules["gpustat"] = GpustatMock([0, 1], ["GPU-aaa", "GPU-bbb"])

     def testGPUWait1(self):
         wait_for_gpu(0, delay_s=0)
@@ -188,7 +192,7 @@ def testGPUWaitFail(self):
     def testDefaultGPU(self):
         import sys

-        sys.modules["GPUtil"] = GPUUtilMock([0], ["GPU-aaa"])
+        sys.modules["gpustat"] = GpustatMock([0], ["GPU-aaa"])
         wait_for_gpu(delay_s=0)


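The mock above only needs to cover the handful of gpustat fields the Tune code touches. For comparison, a hedged sketch (assuming gpustat and an NVIDIA driver are actually installed) of reading the same fields from a real query:

import gpustat

# gpustat.new_query() returns one entry per GPU; these are the fields
# wait_for_gpu and UtilMonitor rely on after this change.
for gpu in gpustat.new_query():
    vram_fraction = gpu.memory_used / gpu.memory_total
    print(gpu.index, gpu.uuid, gpu.name, gpu.utilization, vram_fraction)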
python/ray/tune/utils/util.py: 28 additions & 28 deletions
@@ -32,12 +32,12 @@
 logger = logging.getLogger(__name__)


-def _import_gputil():
+def _import_gpustat():
     try:
-        import GPUtil
+        import gpustat
     except ImportError:
-        GPUtil = None
-    return GPUtil
+        gpustat = None
+    return gpustat


 START_OF_TIME = time.time()
@@ -50,25 +50,25 @@ class UtilMonitor(Thread):
     It keeps track of CPU, RAM, GPU, VRAM usage (each gpu separately) by
     pinging for information every x seconds in a separate thread.

-    Requires psutil and GPUtil to be installed. Can be enabled with
+    Requires psutil and gpustat to be installed. Can be enabled with
     Tuner(param_space={"log_sys_usage": True}).
     """

     def __init__(self, start=True, delay=0.7):
         self.stopped = True
-        GPUtil = _import_gputil()
-        self.GPUtil = GPUtil
-        if GPUtil is None and start:
-            logger.warning("Install gputil for GPU system monitoring.")
+        gpustat = _import_gpustat()
+        self.gpustat = gpustat
+        if gpustat is None and start:
+            logger.warning("Install gpustat for GPU system monitoring.")

         if psutil is None and start:
             logger.warning("Install psutil to monitor system performance.")

-        if GPUtil is None and psutil is None:
+        if gpustat is None and psutil is None:
             return

         super(UtilMonitor, self).__init__()
-        self.delay = delay  # Time between calls to GPUtil
+        self.delay = delay  # Time between calls to gpustat
         self.values = defaultdict(list)
         self.lock = threading.Lock()
         self.daemon = True
@@ -84,18 +84,18 @@ def _read_utilization(self):
                 self.values["ram_util_percent"].append(
                     float(getattr(psutil.virtual_memory(), "percent"))
                 )
-            if self.GPUtil is not None:
+            if self.gpustat is not None:
                 gpu_list = []
                 try:
-                    gpu_list = self.GPUtil.getGPUs()
+                    gpu_list = self.gpustat.new_query()
                 except Exception:
-                    logger.debug("GPUtil failed to retrieve GPUs.")
+                    logger.debug("gpustat failed to retrieve GPUs.")
                 for gpu in gpu_list:
                     self.values["gpu_util_percent" + str(gpu.id)].append(
-                        float(gpu.load)
+                        float(gpu.utilization)
                     )
                     self.values["vram_util_percent" + str(gpu.id)].append(
-                        float(gpu.memoryUtil)
+                        float(gpu.memory_used / gpu.memory_total)
                     )

     def get_data(self):
@@ -451,11 +451,11 @@ def wait_for_gpu(
 ):
     """Checks if a given GPU has freed memory.

-    Requires ``gputil`` to be installed: ``pip install gputil``.
+    Requires ``gpustat`` to be installed: ``pip install gpustat``.

     Args:
         gpu_id: GPU id or uuid to check.
-            Must be found within GPUtil.getGPUs(). If none, resorts to
+            Must be found within gpustat.new_query(). If None, resorts to
             the first item returned from `ray.get_gpu_ids()`.
         target_util: The utilization threshold to reach to unblock.
             Set this to 0 to block until the GPU is completely free.
@@ -467,7 +467,7 @@
         bool: True if free.

     Raises:
-        RuntimeError: If GPUtil is not found, if no GPUs are detected
+        RuntimeError: If gpustat is not found, if no GPUs are detected
             or if the check fails.

     Example:
@@ -488,10 +488,10 @@ def tune_func(config):
         tuner.fit()

     """
-    GPUtil = _import_gputil()
+    gpustat = _import_gpustat()

-    if GPUtil is None:
-        raise RuntimeError("GPUtil must be installed if calling `wait_for_gpu`.")
+    if gpustat is None:
+        raise RuntimeError("gpustat must be installed if calling `wait_for_gpu`.")

     if gpu_id is None:
         gpu_id_list = ray.get_gpu_ids()
@@ -502,7 +502,7 @@
             )
         gpu_id = gpu_id_list[0]

-    gpu_attr = "id"
+    gpu_attr = "index"
     if isinstance(gpu_id, str):
         if gpu_id.isdigit():
             # GPU ID returned from `ray.get_gpu_ids()` is a str representation
@@ -521,7 +521,7 @@ def gpu_id_fn(g):
         # the format of the input `gpu_id`
         return getattr(g, gpu_attr)

-    gpu_ids = {gpu_id_fn(g) for g in GPUtil.getGPUs()}
+    gpu_ids = {gpu_id_fn(g) for g in gpustat.new_query()}
     if gpu_id not in gpu_ids:
         raise ValueError(
             f"{gpu_id} not found in set of available GPUs: {gpu_ids}. "
@@ -530,11 +530,11 @@
         )

     for i in range(int(retry)):
-        gpu_object = next(g for g in GPUtil.getGPUs() if gpu_id_fn(g) == gpu_id)
-        if gpu_object.memoryUtil > target_util:
+        gpu_object = next(g for g in gpustat.new_query() if gpu_id_fn(g) == gpu_id)
+        if gpu_object.memory_used > target_util:
             logger.info(
-                f"Waiting for GPU util to reach {target_util}. "
-                f"Util: {gpu_object.memoryUtil:0.3f}"
+                f"Waiting for gpu memory used to be less than {target_util}. "
+                f"Used: {gpu_object.memory_used:0.3f}"
             )
             time.sleep(delay_s)
         else:
python/requirements.txt: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ scipy
 colorful
 pyyaml
 rich
-gpustat>=1.0.0
+gpustat>=1.1.0
 opentelemetry-sdk
 fastapi
 gymnasium==0.28.1
python/setup.py: 1 addition & 1 deletion
@@ -247,7 +247,7 @@ def get_packages(self):
     "colorful",
     "py-spy >= 0.2.0",
     "requests",
-    "gpustat >= 1.0.0",  # for windows
+    "gpustat >= 1.1.0",
     "grpcio >= 1.32.0; python_version < '3.10'",  # noqa:E501
     "grpcio >= 1.42.0; python_version >= '3.10'",  # noqa:E501
     "opencensus",