import math
import os
import time
import warnings
from contextlib import suppress
from multiprocessing import cpu_count
import numpy as np
import pynvml
import toolz
from dask.utils import parse_bytes
from distributed import Worker, wait
try:
from nvtx import annotate as nvtx_annotate
except ImportError:
    # If the nvtx module is not installed, provide a no-op `annotate` context manager.
from contextlib import contextmanager
@contextmanager
def nvtx_annotate(message=None, color="blue", domain=None):
yield
try:
import ucp
_ucx_110 = ucp.get_ucx_version() >= (1, 10, 0)
_ucx_111 = ucp.get_ucx_version() >= (1, 11, 0)
except ImportError:
_ucx_110 = False
_ucx_111 = False
class CPUAffinity:
def __init__(self, cores):
self.cores = cores
def setup(self, worker=None):
os.sched_setaffinity(0, self.cores)
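# Illustrative usage sketch (not part of the module API): `CPUAffinity` is meant
# to be installed as a Dask worker plugin so each worker process is pinned to
# the CPUs closest to its GPU; the names below are examples only.
#
#     affinity_plugin = CPUAffinity(get_cpu_affinity(0))
#     affinity_plugin.setup()  # pins the current process via os.sched_setaffinity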
class RMMSetup:
def __init__(self, nbytes, managed_memory, async_alloc, log_directory):
self.nbytes = nbytes
self.managed_memory = managed_memory
self.async_alloc = async_alloc
self.logging = log_directory is not None
self.log_directory = log_directory
def setup(self, worker=None):
if self.async_alloc:
import rmm
rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource())
if self.logging:
rmm.enable_logging(
log_file_name=get_rmm_log_file_name(
worker, self.logging, self.log_directory
)
)
elif self.nbytes is not None or self.managed_memory:
import rmm
pool_allocator = False if self.nbytes is None else True
rmm.reinitialize(
pool_allocator=pool_allocator,
managed_memory=self.managed_memory,
initial_pool_size=self.nbytes,
logging=self.logging,
log_file_name=get_rmm_log_file_name(
worker, self.logging, self.log_directory
),
)
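# Illustrative usage sketch (values are examples only): configuring a worker
# with a 1 GiB RMM pool, managed memory and async allocation disabled, and no
# RMM logging.
#
#     rmm_plugin = RMMSetup(
#         nbytes=parse_bytes("1GiB"),
#         managed_memory=False,
#         async_alloc=False,
#         log_directory=None,
#     )
#     rmm_plugin.setup()  # calls rmm.reinitialize(...) in the current process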
def unpack_bitmask(x, mask_bits=64):
"""Unpack a list of integers containing bitmasks.
Parameters
----------
x: list of int
A list of integers
mask_bits: int
An integer determining the bitwidth of `x`
Examples
--------
    >>> from dask_cuda.utils import unpack_bitmask
>>> unpack_bitmask([1 + 2 + 8])
[0, 1, 3]
>>> unpack_bitmask([1 + 2 + 16])
[0, 1, 4]
>>> unpack_bitmask([1 + 2 + 16, 2 + 4])
[0, 1, 4, 65, 66]
>>> unpack_bitmask([1 + 2 + 16, 2 + 4], mask_bits=32)
[0, 1, 4, 33, 34]
"""
res = []
for i, mask in enumerate(x):
if not isinstance(mask, int):
raise TypeError("All elements of the list `x` must be integers")
cpu_offset = i * mask_bits
bytestr = np.frombuffer(
bytes(np.binary_repr(mask, width=mask_bits), "utf-8"), "u1"
)
mask = np.flip(bytestr - ord("0")).astype(bool)
unpacked_mask = np.where(
mask, np.arange(mask_bits) + cpu_offset, np.full(mask_bits, -1)
)
res += unpacked_mask[(unpacked_mask >= 0)].tolist()
return res
@toolz.memoize
def get_cpu_count():
return cpu_count()
@toolz.memoize
def get_gpu_count():
pynvml.nvmlInit()
return pynvml.nvmlDeviceGetCount()
@toolz.memoize
def get_gpu_count_mig(return_uuids=False):
"""Return the number of MIG instances available
Parameters
----------
    return_uuids: bool
        If ``True``, also return the UUIDs of the available MIG instances.
"""
pynvml.nvmlInit()
uuids = []
for index in range(get_gpu_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
try:
is_mig_mode = pynvml.nvmlDeviceGetMigMode(handle)[0]
except pynvml.NVMLError:
# if not a MIG device, i.e. a normal GPU, skip
continue
if is_mig_mode:
count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
miguuids = []
for i in range(count):
try:
mighandle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
device=handle, index=i
)
miguuids.append(mighandle)
uuids.append(pynvml.nvmlDeviceGetUUID(mighandle))
except pynvml.NVMLError:
pass
if return_uuids:
return len(uuids), uuids
return len(uuids)
def get_cpu_affinity(device_index=None):
"""Get a list containing the CPU indices to which a GPU is directly connected.
Use either the device index or the specified device identifier UUID.
Parameters
----------
device_index: int or str
Index or UUID of the GPU device
Examples
--------
>>> from dask_cuda.utils import get_cpu_affinity
    >>> get_cpu_affinity(0)  # DGX-1 has GPUs 0-3 connected to CPUs [0-19, 40-59]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
>>> get_cpu_affinity(5) # DGX-1 has GPUs 5-7 connected to CPUs [20-39, 60-79]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
>>> get_cpu_affinity(1000) # DGX-1 has no device on index 1000
dask_cuda/utils.py:96: UserWarning: Cannot get CPU affinity for device with index
1000, setting default affinity
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
"""
pynvml.nvmlInit()
try:
if device_index and not str(device_index).isnumeric():
# This means device_index is UUID.
# This works for both MIG and non-MIG device UUIDs.
handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(device_index))
if pynvml.nvmlDeviceIsMigDeviceHandle(handle):
# Additionally get parent device handle
# if the device itself is a MIG instance
handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
else:
handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
# Result is a list of 64-bit integers, thus ceil(get_cpu_count() / 64)
affinity = pynvml.nvmlDeviceGetCpuAffinity(
handle, math.ceil(get_cpu_count() / 64),
)
return unpack_bitmask(affinity)
except pynvml.NVMLError:
warnings.warn(
"Cannot get CPU affinity for device with index %d, setting default affinity"
% device_index
)
return list(range(get_cpu_count()))
def get_n_gpus():
try:
return len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
except KeyError:
return get_gpu_count()
def get_device_total_memory(index=0):
"""
    Return the total memory of the CUDA device identified by its index or UUID.
"""
pynvml.nvmlInit()
if index and not str(index).isnumeric():
# This means index is UUID. This works for both MIG and non-MIG device UUIDs.
handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(str(index)))
else:
# This is a device index
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
return pynvml.nvmlDeviceGetMemoryInfo(handle).total
def get_ucx_net_devices(
cuda_device_index, ucx_net_devices, get_openfabrics=True, get_network=False
):
if _ucx_111 and ucx_net_devices == "auto":
return None
if cuda_device_index is None and (
callable(ucx_net_devices) or ucx_net_devices == "auto"
):
raise ValueError(
"A CUDA device index must be specified if the "
"ucx_net_devices variable is either callable or 'auto'"
)
elif cuda_device_index is not None:
dev = int(cuda_device_index)
net_dev = None
if callable(ucx_net_devices):
net_dev = ucx_net_devices(int(cuda_device_index))
elif isinstance(ucx_net_devices, str):
if ucx_net_devices == "auto":
# If TopologicalDistance from ucp is available, we set the UCX
# net device to the closest network device explicitly.
from ucp._libs.topological_distance import TopologicalDistance
net_dev = ""
td = TopologicalDistance()
if get_openfabrics:
ibs = td.get_cuda_distances_from_device_index(dev, "openfabrics")
if len(ibs) > 0:
net_dev += ibs[0]["name"] + ":1"
if get_network:
ifnames = td.get_cuda_distances_from_device_index(dev, "network")
if len(ifnames) > 0:
if len(net_dev) > 0:
net_dev += ","
net_dev += ifnames[0]["name"]
else:
net_dev = ucx_net_devices
return net_dev
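# Illustrative sketch (interface names below are hypothetical): a callable maps
# each GPU index to an interface name, while a plain string is returned as-is.
#
#     >>> get_ucx_net_devices(5, lambda i: "mlx5_%d:1" % (i // 2))
#     'mlx5_2:1'
#     >>> get_ucx_net_devices(0, "eth0")
#     'eth0'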
def get_ucx_config(
enable_tcp_over_ucx=False,
enable_infiniband=False,
enable_nvlink=False,
enable_rdmacm=False,
net_devices="",
cuda_device_index=None,
):
if net_devices == "auto" and enable_infiniband is False:
raise ValueError(
"Using ucx_net_devices='auto' is currently only "
"supported when enable_infiniband=True."
)
ucx_config = {
"tcp": None,
"infiniband": None,
"nvlink": None,
"rdmacm": None,
"net-devices": None,
"cuda_copy": None,
"create_cuda_context": None,
"reuse-endpoints": not _ucx_111,
}
if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
ucx_config["cuda_copy"] = True
if enable_tcp_over_ucx:
ucx_config["tcp"] = True
if enable_infiniband:
ucx_config["infiniband"] = True
if enable_nvlink:
ucx_config["nvlink"] = True
if enable_rdmacm:
ucx_config["rdmacm"] = True
if net_devices is not None and net_devices != "":
ucx_config["net-devices"] = get_ucx_net_devices(cuda_device_index, net_devices)
return ucx_config
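# Illustrative sketch of the returned configuration (keys are the ones defined
# above, values depend on the flags passed):
#
#     cfg = get_ucx_config(enable_tcp_over_ucx=True, enable_nvlink=True)
#     # cfg["tcp"] is True, cfg["nvlink"] is True and cfg["cuda_copy"] is True;
#     # "infiniband", "rdmacm" and "net-devices" stay None, while
#     # "reuse-endpoints" reflects the locally installed UCX version.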
def get_preload_options(
protocol=None,
create_cuda_context=False,
enable_tcp_over_ucx=False,
enable_infiniband=False,
enable_nvlink=False,
enable_rdmacm=False,
ucx_net_devices="",
cuda_device_index=0,
):
"""
Return a dictionary with the preload and preload_argv options required to
    create a CUDA context and enable UCX communication.
Parameters
----------
protocol: None or str
If "ucx", options related to UCX (enable_tcp_over_ucx, enable_infiniband,
enable_nvlink and ucx_net_devices) are added to preload_argv.
create_cuda_context: bool
Ensure the CUDA context gets created at initialization, generally
needed by Dask workers.
    enable_tcp_over_ucx: bool
        Set environment variables to enable TCP over UCX, even when InfiniBand or
        NVLink support is disabled.
enable_infiniband: bool
Set environment variables to enable UCX InfiniBand support. Implies
        enable_tcp_over_ucx=True.
enable_rdmacm: bool
Set environment variables to enable UCX RDMA connection manager support.
Currently requires enable_infiniband=True.
enable_nvlink: bool
Set environment variables to enable UCX NVLink support. Implies
        enable_tcp_over_ucx=True.
ucx_net_devices: str or callable
A string with the interface name to be used for all devices (empty
string means use default), or a callable function taking an integer
identifying the GPU index.
cuda_device_index: int
The index identifying the CUDA device used by this worker, only used
when ucx_net_devices is callable.
Example
-------
>>> from dask_cuda.utils import get_preload_options
>>> get_preload_options()
{'preload': ['dask_cuda.initialize'], 'preload_argv': []}
>>> get_preload_options(protocol="ucx", create_cuda_context=True,
... enable_infiniband=True, cuda_device_index=5,
... ucx_net_devices=lambda i: "mlx5_%d:1" % (i // 2))
{'preload': ['dask_cuda.initialize'],
'preload_argv': ['--create-cuda-context',
'--enable-infiniband',
'--net-devices=mlx5_2:1']}
"""
preload_options = {"preload": ["dask_cuda.initialize"], "preload_argv": []}
if create_cuda_context:
preload_options["preload_argv"].append("--create-cuda-context")
if protocol == "ucx":
initialize_ucx_argv = []
if enable_tcp_over_ucx:
initialize_ucx_argv.append("--enable-tcp-over-ucx")
if enable_infiniband:
initialize_ucx_argv.append("--enable-infiniband")
if enable_rdmacm:
initialize_ucx_argv.append("--enable-rdmacm")
if enable_nvlink:
initialize_ucx_argv.append("--enable-nvlink")
if ucx_net_devices is not None and ucx_net_devices != "":
net_dev = get_ucx_net_devices(cuda_device_index, ucx_net_devices)
initialize_ucx_argv.append("--net-devices=%s" % net_dev)
preload_options["preload_argv"].extend(initialize_ucx_argv)
return preload_options
def get_rmm_log_file_name(dask_worker, logging=False, log_directory=None):
return (
os.path.join(
log_directory,
"rmm_log_%s.txt"
% (
(
dask_worker.name.split("/")[-1]
if isinstance(dask_worker.name, str)
else dask_worker.name
)
if hasattr(dask_worker, "name")
else "scheduler"
),
)
if logging
else None
)
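# Illustrative sketch (worker name and paths are hypothetical): with logging
# enabled, only the last "/"-separated component of the worker name is kept.
#
#     class _FakeWorker:  # stand-in object exposing only a ``name`` attribute
#         name = "tcp://10.0.0.1:8786/3"
#
#     # -> os.path.join("/tmp/logs", "rmm_log_3.txt")
#     get_rmm_log_file_name(_FakeWorker(), logging=True, log_directory="/tmp/logs")
#
#     # An object without a ``name`` attribute yields "rmm_log_scheduler.txt",
#     # and logging=False returns None.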
def wait_workers(
client, min_timeout=10, seconds_per_gpu=2, n_gpus=None, timeout_callback=None
):
"""
Wait for workers to be available. When a timeout occurs, a callback
is executed if specified. Generally used for tests.
Parameters
----------
client: distributed.Client
Instance of client, used to query for number of workers connected.
min_timeout: float
Minimum number of seconds to wait before timeout.
seconds_per_gpu: float
Seconds to wait for each GPU on the system. For example, if its
value is 2 and there is a total of 8 GPUs (workers) being started,
a timeout will occur after 16 seconds. Note that this value is only
used as timeout when larger than min_timeout.
n_gpus: None or int
        If specified, wait for that number of GPUs (i.e., Dask workers) to come
        online; otherwise wait for a total of `get_n_gpus` workers.
timeout_callback: None or callable
A callback function to be executed if a timeout occurs, ignored if
None.
Returns
-------
True if all workers were started, False if a timeout occurs.
"""
n_gpus = n_gpus or get_n_gpus()
timeout = max(min_timeout, seconds_per_gpu * n_gpus)
start = time.time()
while True:
if len(client.scheduler_info()["workers"]) == n_gpus:
return True
elif time.time() - start > timeout:
if callable(timeout_callback):
timeout_callback()
return False
else:
time.sleep(0.1)
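# Illustrative sketch (assumes ``cluster`` is an already-started CUDA cluster):
#
#     from distributed import Client
#
#     client = Client(cluster)
#     assert wait_workers(client, n_gpus=2, timeout_callback=cluster.close)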
async def _all_to_all(client):
"""
    Trigger all-to-all communication between workers and scheduler.
"""
workers = list(client.scheduler_info()["workers"])
futs = []
for w in workers:
bit_of_data = b"0" * 1
data = client.map(lambda x: bit_of_data, range(1), pure=False, workers=[w])
futs.append(data[0])
await wait(futs)
def f(x):
pass
new_futs = []
for w in workers:
for future in futs:
data = client.submit(f, future, workers=[w], pure=False)
new_futs.append(data)
await wait(new_futs)
def all_to_all(client):
return client.sync(_all_to_all, client=client, asynchronous=client.asynchronous)
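# Illustrative usage sketch: in tests, ``all_to_all`` is typically called once
# the expected workers are up, so every worker exchanges a tiny payload with
# every other worker and communication channels are warmed up before timing.
#
#     if wait_workers(client):
#         all_to_all(client)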
def parse_cuda_visible_device(dev):
"""Parses a single CUDA device identifier
A device identifier must either be an integer, a string containing an
    integer, or a string containing the device's UUID, beginning with the prefix
    'GPU-', 'MIG-GPU-' or 'MIG-'.
>>> parse_cuda_visible_device(2)
2
>>> parse_cuda_visible_device('2')
2
>>> parse_cuda_visible_device('GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005')
'GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005'
>>> parse_cuda_visible_device('Foo')
Traceback (most recent call last):
...
ValueError: Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers or
strings beginning with 'GPU-' or 'MIG-GPU-' prefixes.
"""
try:
return int(dev)
except ValueError:
if any(dev.startswith(prefix) for prefix in ["GPU-", "MIG-GPU-", "MIG-"]):
return dev
else:
raise ValueError(
"Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers "
"or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes"
" or 'MIG-<UUID>'."
)
def cuda_visible_devices(i, visible=None):
"""Cycling values for CUDA_VISIBLE_DEVICES environment variable
Examples
--------
>>> cuda_visible_devices(0, range(4))
'0,1,2,3'
>>> cuda_visible_devices(3, range(8))
'3,4,5,6,7,0,1,2'
"""
if visible is None:
try:
visible = map(
parse_cuda_visible_device, os.environ["CUDA_VISIBLE_DEVICES"].split(",")
)
except KeyError:
visible = range(get_n_gpus())
visible = list(visible)
L = visible[i:] + visible[:i]
return ",".join(map(str, L))
def nvml_device_index(i, CUDA_VISIBLE_DEVICES):
"""Get the device index for NVML addressing
NVML expects the index of the physical device, unlike CUDA runtime which
expects the address relative to `CUDA_VISIBLE_DEVICES`. This function
returns the i-th device index from the `CUDA_VISIBLE_DEVICES`
comma-separated string of devices or list.
Examples
--------
>>> nvml_device_index(1, "0,1,2,3")
1
>>> nvml_device_index(1, "1,2,3,0")
2
>>> nvml_device_index(1, [0,1,2,3])
1
>>> nvml_device_index(1, [1,2,3,0])
2
>>> nvml_device_index(1, ["GPU-84fd49f2-48ad-50e8-9f2e-3bf0dfd47ccb",
"GPU-d6ac2d46-159b-5895-a854-cb745962ef0f",
"GPU-158153b7-51d0-5908-a67c-f406bc86be17"])
"MIG-d6ac2d46-159b-5895-a854-cb745962ef0f"
>>> nvml_device_index(2, ["MIG-41b3359c-e721-56e5-8009-12e5797ed514",
"MIG-65b79fff-6d3c-5490-a288-b31ec705f310",
"MIG-c6e2bae8-46d4-5a7e-9a68-c6cf1f680ba0"])
"MIG-c6e2bae8-46d4-5a7e-9a68-c6cf1f680ba0"
>>> nvml_device_index(1, 2)
Traceback (most recent call last):
...
ValueError: CUDA_VISIBLE_DEVICES must be `str` or `list`
"""
if isinstance(CUDA_VISIBLE_DEVICES, str):
ith_elem = CUDA_VISIBLE_DEVICES.split(",")[i]
if ith_elem.isnumeric():
return int(ith_elem)
else:
return ith_elem
elif isinstance(CUDA_VISIBLE_DEVICES, list):
return CUDA_VISIBLE_DEVICES[i]
else:
raise ValueError("`CUDA_VISIBLE_DEVICES` must be `str` or `list`")
def parse_device_memory_limit(device_memory_limit, device_index=0):
"""Parse memory limit to be used by a CUDA device.
Parameters
----------
device_memory_limit: float, int, str or None
This can be a float (fraction of total device memory), an integer (bytes),
        a string (like "5GB" or "5000M"), or "auto", 0, or None for the total
        device size.
device_index: int or str
The index or UUID of the device from which to obtain the total memory amount.
Default: 0.
Examples
--------
>>> # On a 32GB CUDA device
>>> parse_device_memory_limit(None)
34089730048
>>> parse_device_memory_limit(0.8)
27271784038
>>> parse_device_memory_limit(1000000000)
1000000000
>>> parse_device_memory_limit("1GB")
1000000000
"""
if any(device_memory_limit == v for v in [0, "0", None, "auto"]):
return get_device_total_memory(device_index)
with suppress(ValueError, TypeError):
device_memory_limit = float(device_memory_limit)
if isinstance(device_memory_limit, float) and device_memory_limit <= 1:
return int(get_device_total_memory(device_index) * device_memory_limit)
if isinstance(device_memory_limit, str):
return parse_bytes(device_memory_limit)
else:
return int(device_memory_limit)
class MockWorker(Worker):
"""Mock Worker class preventing NVML from getting used by SystemMonitor.
    By preventing the Worker from initializing NVML in the SystemMonitor, we can
    test the behavior of multiple devices in `CUDA_VISIBLE_DEVICES` on
    single-GPU machines.
"""
def __init__(self, *args, **kwargs):
import distributed
distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count
self._device_get_count = distributed.diagnostics.nvml.device_get_count
super().__init__(*args, **kwargs)
def __del__(self):
import distributed
distributed.diagnostics.nvml.device_get_count = self._device_get_count
@staticmethod
def device_get_count():
return 0
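# Illustrative sketch for tests (cluster arguments are hypothetical): using
# MockWorker as the worker class lets CUDA_VISIBLE_DEVICES tests pretend there
# are several devices on a single-GPU machine without SystemMonitor using NVML.
#
#     cluster = LocalCUDACluster(
#         worker_class=MockWorker, CUDA_VISIBLE_DEVICES="0,1,2,3"
#     )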
def get_gpu_uuid_from_index(device_index=0):
"""Get GPU UUID from CUDA device index.
Parameters
----------
device_index: int or str
The index of the device from which to obtain the UUID. Default: 0.
Examples
--------
>>> get_gpu_uuid_from_index()
'GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005'
>>> get_gpu_uuid_from_index(3)
'GPU-9fb42d6f-7d6b-368f-f79c-3c3e784c93f6'
"""
import pynvml
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
return pynvml.nvmlDeviceGetUUID(handle).decode("utf-8")