Skip to content

Commit c6f0327

Browse files
sdesrozis, Desroziers, vfdev-5
authored
Issue 1165 : nccl + torch.cuda not available (#1166)
* fix issue 1165 * Update ignite/distributed/comp_models/native.py Co-authored-by: vfdev <vfdev.5@gmail.com> * add test for nccl /wo gpu Co-authored-by: Desroziers <sylvain.desroziers@ifpen.fr> Co-authored-by: vfdev <vfdev.5@gmail.com>
1 parent ab546ab commit c6f0327

File tree

3 files changed

+26
-0
lines changed

3 files changed

+26
-0
lines changed

ignite/distributed/comp_models/native.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ def _create_from_backend(self, backend, timeout=None, **kwargs):
7474
if timeout is not None:
7575
init_pg_kwargs["timeout"] = timeout
7676

77+
if backend == dist.Backend.NCCL and not torch.cuda.is_available():
78+
raise RuntimeError("Nccl backend is required but no cuda capable devices")
79+
7780
dist.init_process_group(backend, init_method="env://", **init_pg_kwargs)
7881
# https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
7982
dist.barrier()

tests/ignite/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,12 @@ def _xla_execute(fn, args, nprocs):
216216
@pytest.fixture()
217217
def xmp_executor():
218218
yield _xla_execute
219+
220+
221+
@pytest.fixture()
def mock_gpu_is_not_available():
    """Yield a mocked ``torch.cuda`` module whose ``is_available()`` reports False.

    Lets tests exercise the "NCCL requested but no CUDA devices" error path
    on machines that do have GPUs. The patch is undone when the fixture
    finalizes.
    """
    from unittest.mock import patch

    # Equivalent to `with patch(...) as m:` — start/stop spelled out so the
    # teardown is explicit even if the test body raises.
    patcher = patch("torch.cuda")
    cuda_mock = patcher.start()
    cuda_mock.is_available.return_value = False
    try:
        yield cuda_mock
    finally:
        patcher.stop()

tests/ignite/distributed/comp_models/test_native.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,20 @@ def test__native_dist_model():
3232
assert "mpi" not in available_backends
3333

3434

35+
@pytest.mark.distributed
36+
@pytest.mark.skipif(not dist.is_nccl_available(), reason="Skip if nccl not available")
37+
def test__native_nccl_but_no_gpu(mock_gpu_is_not_available):
38+
39+
env_backup = os.environ
40+
41+
with pytest.raises(RuntimeError, match=r"Nccl backend is required but no cuda capable devices"):
42+
_NativeDistModel(backend="nccl")
43+
44+
# environ could be corrupted by _NativeDistModel
45+
os.environ.clear()
46+
os.environ.update(env_backup)
47+
48+
3549
@pytest.mark.distributed
3650
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
3751
def test__native_dist_model_create_from_backend_bad_config():

0 commit comments

Comments
 (0)