Skip to content

Commit f44b79a

Browse files
committed
Fixes failing tests
1 parent 199888e commit f44b79a

File tree

2 files changed

+12 -12 lines changed

2 files changed

+12 -12 lines changed

.circleci/config.yml

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ run_pytorch_container: &run_pytorch_container
4747
environment:
4848
wd: << pipeline.parameters.workingdir >>
4949
command: |
50-
docker run --gpus=all --rm -itd -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
50+
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
5151
docker exec -it pthd nvidia-smi
5252
docker exec -it pthd ls
5353
@@ -154,37 +154,37 @@ jobs:
154154
- run:
155155
name: "Run without backend"
156156
command: |
157-
export example_path="examples/contrib/new-cifar10"
157+
export example_path="examples/contrib/cifar10"
158158
# initial run
159-
export stop_cmd="--stop_iteration=1000"
159+
export stop_cmd="--stop_iteration=500"
160160
export test_cmd="CI=1 python ${example_path}/main.py run"
161161
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
162162
# resume
163-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-1000/training_checkpoint_1000.pt"
163+
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
164164
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
165165
166166
- run:
167167
name: "Run with NCCL backend using torch dist launch"
168168
command: |
169-
export example_path="examples/contrib/new-cifar10"
169+
export example_path="examples/contrib/cifar10"
170170
# initial run
171-
export stop_cmd="--stop_iteration=1000"
171+
export stop_cmd="--stop_iteration=500"
172172
export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
173173
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
174174
# resume
175-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
175+
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
176176
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
177177
178178
- run:
179179
name: "Run with NCCL backend using spawn"
180180
command: |
181-
export example_path="examples/contrib/new-cifar10"
181+
export example_path="examples/contrib/cifar10"
182182
# initial run
183-
export stop_cmd="--stop_iteration=1000"
183+
export stop_cmd="--stop_iteration=500"
184184
export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --num_procs_per_node=2"
185185
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
186186
# resume
187-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
187+
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
188188
docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
189189
190190

tests/ignite/distributed/test_auto.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def _test_auto_model_optimizer(ws, device):
4444
model = auto_model(model)
4545
if ws > 1:
4646
assert isinstance(model, nn.parallel.DistributedDataParallel)
47-
elif torch.cuda.is_available() and torch.cuda.device_count() > 1:
47+
elif device != "cpu" and torch.cuda.is_available() and torch.cuda.device_count() > 1:
4848
assert isinstance(model, nn.parallel.DataParallel)
4949
else:
5050
assert isinstance(model, nn.Module)
@@ -91,7 +91,7 @@ def test_auto_methods_nccl(distributed_context_single_node_nccl):
9191
_test_auto_dataloader(ws=ws, nproc=ws)
9292
_test_auto_dataloader(ws=ws, nproc=ws, sampler_name="WeightedRandomSampler")
9393

94-
device = "cuda:{}".format(lrank) if ws > 1 else "cuda"
94+
device = "cuda"
9595
_test_auto_model_optimizer(ws, device)
9696

9797

0 commit comments

Comments (0)