Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ jobs:
# and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778
docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git"
docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git /horovod && cd /horovod && python setup.py sdist"
docker exec -it pthd /bin/bash -c "conda install -y cmake=3.16 nccl=2.5 -c conda-forge"
docker exec -it pthd /bin/bash -c "conda install -y cmake=3.16 nccl=2.7 -c conda-forge"
docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig'
docker exec -it pthd horovodrun --check-build

Expand Down
9 changes: 3 additions & 6 deletions tests/ignite/distributed/comp_models/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def _test__hvd_dist_model_create_from_backend_no_dist(backend, true_device):
model = _HorovodDistModel.create_from_backend(backend=backend)

assert hvd.rank() > -1

print("true_device", true_device)
_assert_model(
model,
{
Expand All @@ -62,9 +62,6 @@ def _test__hvd_dist_model_create_from_backend_dist(backend, true_device):
with pytest.raises(RuntimeError, match=r"Can not re-initialize Horovod if it is already initialized"):
_HorovodDistModel.create_from_backend(backend=backend)

if "cuda" in true_device:
true_device += ":{}".format(hvd.local_rank())

_assert_model(
model,
{
Expand Down Expand Up @@ -139,8 +136,8 @@ def test__hvd_dist_model_create_no_dist(gloo_hvd_executor):
@pytest.mark.distributed
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__hvd_dist_model_create_no_dist_cuda(gloo_hvd_executor):
gloo_hvd_executor(_test__hvd_dist_model_create_from_backend_no_dist, ("horovod", "cuda:0"), np=1)
gloo_hvd_executor(_test__hvd_dist_model_create_from_context_no_dist, ("horovod", "cuda:0"), np=1)
gloo_hvd_executor(_test__hvd_dist_model_create_from_backend_no_dist, ("horovod", "cuda"), np=1)
gloo_hvd_executor(_test__hvd_dist_model_create_from_context_no_dist, ("horovod", "cuda"), np=1)


@pytest.mark.distributed
Expand Down
13 changes: 9 additions & 4 deletions tests/ignite/engine/test_create_supervised.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from distutils.version import LooseVersion
from typing import Optional

import pytest
Expand Down Expand Up @@ -45,8 +46,10 @@ def _test_create_supervised_trainer(
assert model.weight.data[0, 0].item() == approx(1.3)
assert model.bias.item() == approx(0.8)
else:
with pytest.raises(RuntimeError, match=r"device type"):
trainer.run(data)
if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
# This is broken in 1.6.0 but will probably be fixed in 1.7.0
with pytest.raises(RuntimeError, match=r"is on CPU, but expected them to be on GPU"):
trainer.run(data)


def _test_create_supervised_evaluator(
Expand Down Expand Up @@ -84,8 +87,10 @@ def _test_create_supervised_evaluator(
assert model.bias.item() == approx(0.0)

else:
with pytest.raises(RuntimeError, match=r"device type"):
evaluator.run(data)
if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"):
# This is broken in 1.6.0 but will probably be fixed in 1.7.0
with pytest.raises(RuntimeError, match=r"is on CPU, but expected them to be on GPU"):
evaluator.run(data)


def test_create_supervised_trainer():
Expand Down
2 changes: 1 addition & 1 deletion tests/run_gpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ py.test --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k '
py.test --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed


if [ "${ngpus}" != "1" ]; then
if [ ${ngpus} -gt 1 ]; then

export WORLD_SIZE=${ngpus}
py.test --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python3.7 tests -m distributed -vvv
Expand Down