@@ -47,7 +47,7 @@ run_pytorch_container: &run_pytorch_container
4747 environment :
4848 wd : << pipeline.parameters.workingdir >>
4949 command : |
50- docker run --gpus=all --rm -itd -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
50+ docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
5151 docker exec -it pthd nvidia-smi
5252 docker exec -it pthd ls
5353
@@ -154,37 +154,37 @@ jobs:
154154 - run :
155155 name : " Run without backend"
156156 command : |
157- export example_path="examples/contrib/new-cifar10"
157+ export example_path="examples/contrib/cifar10"
158158 # initial run
159- export stop_cmd="--stop_iteration=1000 "
159+ export stop_cmd="--stop_iteration=500 "
160160 export test_cmd="CI=1 python ${example_path}/main.py run"
161161 docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
162162 # resume
163- export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-1000/training_checkpoint_1000.pt"
163+ export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
164164 docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
165165
166166 - run :
167167 name : " Run with NCCL backend using torch dist launch"
168168 command : |
169- export example_path="examples/contrib/new-cifar10"
169+ export example_path="examples/contrib/cifar10"
170170 # initial run
171- export stop_cmd="--stop_iteration=1000 "
171+ export stop_cmd="--stop_iteration=500 "
172172 export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
173173 docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
174174 # resume
175- export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
175+ export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
176176 docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
177177
178178 - run :
179179 name : " Run with NCCL backend using spawn"
180180 command : |
181- export example_path="examples/contrib/new-cifar10"
181+ export example_path="examples/contrib/cifar10"
182182 # initial run
183- export stop_cmd="--stop_iteration=1000 "
183+ export stop_cmd="--stop_iteration=500 "
184184 export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --num_procs_per_node=2"
185185 docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
186186 # resume
187- export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-1000/training_checkpoint_1000.pt"
187+ export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
188188 docker exec -it pthd /bin/bash -c "${test_cmd} ${resume_opt}"
189189
190190
0 commit comments