Limit GPU metrics to visible devices only #3810

Merged 8 commits on Oct 6, 2020
Changes from 6 commits
6 changes: 5 additions & 1 deletion distributed/dashboard/components/nvml.py
@@ -139,13 +139,17 @@ def update(self):
             except KeyError:
                 continue
             metrics = ws.metrics["gpu"]
-            for j, (u, mem_used, mem_total) in enumerate(
+            for j, (u, mem_used, procs, mem_total) in enumerate(
                 zip(
                     metrics["utilization"],
                     metrics["memory-used"],
+                    metrics["procs"],
                     info["memory-total"],
                 )
             ):
+                # find which GPU maps to which process
+                if ws.pid not in procs:

Member:
I think I resolved the logic issues in the diagnostics display by adding process information from each GPU. What we had before: each worker on a node would collect the data for all of the node's GPUs (if multiple were available) and pass it back to this for-loop, so we were displaying roughly n^2 the amount of data:

Node 1-worker 1 -> GPU0->8
Node 1-worker 2 -> GPU0->7
....
Node 2-worker 1 -> GPU0->7
Node 2-worker 2 -> GPU0->7
....

This is not ideal, but by collecting process information from the GPU and passing it back to the dashboard logic, we can now match the worker's pid against the pids reported by the GPU and only display a GPU when the pids match.


@quasiben (Member) commented Oct 2, 2020:
@pentschev do you think this is the best way to gather unique worker processes per GPU?


Member:
Why can't you do what I said in #3810 (comment)? There's only one relevant GPU per process -- the first GPU in CUDA_VISIBLE_DEVICES. We would need something equivalent to:

import os
import pynvml

def get_used_memory():
    pynvml.nvmlInit()  # safe to call more than once
    # NVML indexes physical devices, so cast the first CUDA_VISIBLE_DEVICES
    # entry (a string) to int before asking for the handle.
    gpu_idx = int(os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0])
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
    return pynvml.nvmlDeviceGetMemoryInfo(handle).used

used_memory_per_gpu = client.run(get_used_memory)  # pass the function itself

In other words, we shouldn't be capturing data for all GPUs in CUDA_VISIBLE_DEVICES and reporting them all for each worker, but only the first one.
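
To illustrate why the first entry is unique per worker: dask-cuda style setups give each worker its own rotated CUDA_VISIBLE_DEVICES list. A minimal sketch, assuming such a rotation (the helper below is illustrative, not dask-cuda's actual API):

def rotated_visible_devices(n_gpus, worker_index):
    # Worker 0 -> "0,1,2,3", worker 1 -> "1,2,3,0", etc., so the first entry
    # differs for every worker on the node.
    order = [(worker_index + i) % n_gpus for i in range(n_gpus)]
    return ",".join(str(i) for i in order)

# The first device in each worker's CUDA_VISIBLE_DEVICES is therefore the only
# GPU that worker needs to report on.
assert rotated_visible_devices(4, 1).split(",")[0] == "1"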


Member:
And note that, to answer your question more directly, this may work fine until we have PID collisions, as is bound to happen in sufficiently large clusters with multiple nodes. You'd probably need to match that with a unique identifier for each node to be more resilient; I'm not sure we have a way to do that in Dask. But this solution is fine with me as well; the only other way it could be more reliable is to use pynvml.nvmlDeviceGetSerial to match by a GPU's serial number.
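
A minimal sketch of the nvmlDeviceGetSerial idea, assuming NVML has already been initialized in the process; nvmlDeviceGetSerial can raise NVMLError on boards that don't expose a serial, so the PCI bus id is used as a fallback here:

import os

import pynvml

def gpu_process_key(device_index=0):
    # Pair the worker's pid with a per-GPU identifier so that pid collisions
    # across nodes cannot be confused with one another.
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    try:
        gpu_id = pynvml.nvmlDeviceGetSerial(handle)
    except pynvml.NVMLError:
        gpu_id = pynvml.nvmlDeviceGetPciInfo(handle).busId
    return (gpu_id, os.getpid())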


Member:
What about including an IP address as well?
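
A sketch of that variant, pairing the node's hostname with the pid so the key stays unique across nodes (purely illustrative, not code from this PR):

import os
import socket

def worker_gpu_key():
    # (host, pid) is unique cluster-wide as long as hostnames are unique,
    # which sidesteps cross-node PID collisions.
    return (socket.gethostname(), os.getpid())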


Member:
sorry, forgot to push the change

+                    continue
                 memory_max = max(memory_max, mem_total)
                 memory_total += mem_total
                 utilization.append(int(u))
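
A standalone sketch of the matching rule the thread converges on, assuming pynvml and at least one GPU are available; it reports a GPU only when the current process appears in that GPU's compute-process list:

import os

import pynvml

pynvml.nvmlInit()
my_pid = os.getpid()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    gpu_pids = [p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)]
    if my_pid not in gpu_pids:
        continue  # some other worker/process owns this GPU
    print(i, pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)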
44 changes: 38 additions & 6 deletions distributed/diagnostics/nvml.py
@@ -1,28 +1,60 @@
+import os
 import pynvml
 
-handles = None
+nvmlInit = None
+
+
+def init_once():
+    global nvmlInit
+    if nvmlInit is not None:
+        return
+
+    from pynvml import nvmlInit as _nvmlInit
+
+    nvmlInit = _nvmlInit
+    nvmlInit()
 
 
 def _pynvml_handles():
-    global handles
-    if handles is None:
-        pynvml.nvmlInit()
-        count = pynvml.nvmlDeviceGetCount()
-        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
+    count = pynvml.nvmlDeviceGetCount()
+    try:
+        cuda_visible_devices = [

Since dask-cuda just reorders the devices to change the CUDA device enumeration, this is still getting all the devices, the same way NVML does, right?


Member:
I'm not sure that I understand this comment. My understanding is that pynvml doesn't respect the CUDA_VISIBLE_DEVICES environment variable, so we need to handle this manually.

+            int(idx) for idx in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
+        ]
+    except ValueError:
+        # CUDA_VISIBLE_DEVICES is not set
+        cuda_visible_devices = False
+    if not cuda_visible_devices:
+        cuda_visible_devices = list(range(count))
+    handles = [
+        pynvml.nvmlDeviceGetHandleByIndex(i)
+        for i in range(count)
+        if i in cuda_visible_devices
+    ]
     return handles
 
 
 def real_time():
+    init_once()
     handles = _pynvml_handles()
     return {
         "utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
         "memory-used": [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles],
+        "procs": [
+            [p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(h)]
+            for h in handles
+        ],
     }
 
 
 def one_time():
+    init_once()
     handles = _pynvml_handles()
     return {
         "memory-total": [pynvml.nvmlDeviceGetMemoryInfo(h).total for h in handles],
         "name": [pynvml.nvmlDeviceGetName(h).decode() for h in handles],
+        "procs": [
+            [p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(h)]
+            for h in handles
+        ],
     }
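
A rough usage sketch of the helpers above; the numbers and device name are illustrative, but with CUDA_VISIBLE_DEVICES="1" on a two-GPU machine each list should have length one:

import os
from distributed.diagnostics import nvml

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

print(nvml.one_time())
# e.g. {"memory-total": [16945512448], "name": ["Tesla V100-SXM2-16GB"], "procs": [[]]}
print(nvml.real_time())
# e.g. {"utilization": [0], "memory-used": [1048576], "procs": [[]]}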
26 changes: 26 additions & 0 deletions distributed/diagnostics/tests/test_nvml.py
@@ -0,0 +1,26 @@
import pytest
import os

pytest.importorskip("pynvml")

from distributed.diagnostics import nvml


def test_one_time():
output = nvml.one_time()
assert "memory-total" in output
assert "name" in output

assert len(output["name"]) > 0


def test_1_visible_devices():
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
output = nvml.one_time()
assert len(output["memory-total"]) == 1


def test_2_visible_devices():
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
output = nvml.one_time()
assert len(output["memory-total"]) == 2
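
These tests set CUDA_VISIBLE_DEVICES for the whole process; a possible variant (not in this PR) uses pytest's monkeypatch fixture so the environment is restored afterwards:

def test_visible_devices_isolated(monkeypatch):
    # monkeypatch undoes the environment change when the test finishes.
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0")
    output = nvml.one_time()
    assert len(output["memory-total"]) == 1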