diff --git a/distributed/dashboard/components/nvml.py b/distributed/dashboard/components/nvml.py
index 534c396cbc0..34cce3c4bc7 100644
--- a/distributed/dashboard/components/nvml.py
+++ b/distributed/dashboard/components/nvml.py
@@ -131,29 +131,23 @@ def update(self):
             memory_total = 0
             memory_max = 0
             worker = []
-            i = 0
-            for ws in workers:
+            for idx, ws in enumerate(workers):
                 try:
                     info = ws.extra["gpu"]
                 except KeyError:
                     continue
                 metrics = ws.metrics["gpu"]
 
-                for j, (u, mem_used, mem_total) in enumerate(
-                    zip(
-                        metrics["utilization"],
-                        metrics["memory-used"],
-                        info["memory-total"],
-                    )
-                ):
-                    memory_max = max(memory_max, mem_total)
-                    memory_total += mem_total
-                    utilization.append(int(u))
-                    memory.append(mem_used)
-                    worker.append(ws.address)
-                    gpu_index.append(j)
-                    y.append(i)
-                    i += 1
+                u = metrics["utilization"]
+                mem_used = metrics["memory-used"]
+                mem_total = info["memory-total"]
+                memory_max = max(memory_max, mem_total)
+                memory_total += mem_total
+                utilization.append(int(u))
+                memory.append(mem_used)
+                worker.append(ws.address)
+                gpu_index.append(idx)
+                y.append(idx)
 
             memory_text = [format_bytes(m) for m in memory]
 
diff --git a/distributed/diagnostics/nvml.py b/distributed/diagnostics/nvml.py
index a96a5547598..c1bbb4161a8 100644
--- a/distributed/diagnostics/nvml.py
+++ b/distributed/diagnostics/nvml.py
@@ -1,28 +1,48 @@
+import os
 import pynvml
 
-handles = None
+nvmlInit = None
+
+
+def init_once():
+    global nvmlInit
+    if nvmlInit is not None:
+        return
+
+    from pynvml import nvmlInit as _nvmlInit
+
+    nvmlInit = _nvmlInit
+    nvmlInit()
 
 
 def _pynvml_handles():
-    global handles
-    if handles is None:
-        pynvml.nvmlInit()
-        count = pynvml.nvmlDeviceGetCount()
-        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
-    return handles
+    count = pynvml.nvmlDeviceGetCount()
+    try:
+        cuda_visible_devices = [
+            int(idx) for idx in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
+        ]
+    except ValueError:
+        # CUDA_VISIBLE_DEVICES is not set
+        cuda_visible_devices = False
+    if not cuda_visible_devices:
+        cuda_visible_devices = list(range(count))
+    gpu_idx = cuda_visible_devices[0]
+    return pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
 
 
 def real_time():
-    handles = _pynvml_handles()
+    init_once()
+    h = _pynvml_handles()
     return {
-        "utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
-        "memory-used": [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles],
+        "utilization": pynvml.nvmlDeviceGetUtilizationRates(h).gpu,
+        "memory-used": pynvml.nvmlDeviceGetMemoryInfo(h).used,
     }
 
 
 def one_time():
-    handles = _pynvml_handles()
+    init_once()
+    h = _pynvml_handles()
     return {
-        "memory-total": [pynvml.nvmlDeviceGetMemoryInfo(h).total for h in handles],
-        "name": [pynvml.nvmlDeviceGetName(h).decode() for h in handles],
+        "memory-total": pynvml.nvmlDeviceGetMemoryInfo(h).total,
+        "name": pynvml.nvmlDeviceGetName(h).decode(),
     }
diff --git a/distributed/diagnostics/tests/test_nvml.py b/distributed/diagnostics/tests/test_nvml.py
new file mode 100644
index 00000000000..820ba57a8a5
--- /dev/null
+++ b/distributed/diagnostics/tests/test_nvml.py
@@ -0,0 +1,34 @@
+import pytest
+import os
+
+pynvml = pytest.importorskip("pynvml")
+
+from distributed.diagnostics import nvml
+
+
+def test_one_time():
+    output = nvml.one_time()
+    assert "memory-total" in output
+    assert "name" in output
+
+    assert len(output["name"]) > 0
+
+
+def test_1_visible_devices():
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    output = nvml.one_time()
+    assert output["memory-total"] > 0
+
+
+@pytest.mark.parametrize("CVD", ["1,0", "0,1"])
+def test_2_visible_devices(CVD):
+    os.environ["CUDA_VISIBLE_DEVICES"] = CVD
+    idx = int(CVD.split(",")[0])
+
+    h = nvml._pynvml_handles()
+    h2 = pynvml.nvmlDeviceGetHandleByIndex(idx)
+
+    s = pynvml.nvmlDeviceGetSerial(h)
+    s2 = pynvml.nvmlDeviceGetSerial(h2)
+
+    assert s == s2