Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 62 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ run_pytorch_container: &run_pytorch_container
environment:
wd: << pipeline.parameters.workingdir >>
command: |
docker run --gpus=all --rm -itd -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
docker exec -it pthd nvidia-smi
docker exec -it pthd ls

Expand Down Expand Up @@ -80,7 +80,7 @@ jobs:

# pytest on cuda
export test_cmd='sh tests/run_gpu_tests.sh'
docker exec -it pthd /bin/bash -c "$test_cmd"
docker exec -it pthd /bin/bash -c "${test_cmd}"

# MNIST tests

Expand Down Expand Up @@ -118,7 +118,7 @@ jobs:
- run:
name: Codecov upload
command: |
codecov -F gpu || echo 'Codecov upload failed'
bash <(curl -s https://codecov.io/bash) -Z -F gpu


two_gpus_tests:
Expand All @@ -135,7 +135,64 @@ jobs:
name: Run 1 Node 2 GPUs Unit Tests
command: |
export test_cmd='sh tests/run_gpu_tests.sh 2'
docker exec -it pthd /bin/bash -c "$test_cmd"
docker exec -it pthd /bin/bash -c "${test_cmd}"

- run:
name: Codecov upload
command: |
bash <(curl -s https://codecov.io/bash) -Z -F gpu-2


two_gpus_check_dist_cifar10_example:
<<: *two_gpus

working_directory: << pipeline.parameters.workingdir >>

steps:
- checkout
- <<: *pull_pytorch_stable_image
- <<: *run_pytorch_container
- <<: *install_dependencies
- run:
name: "Install additional example dependencies"
command: |
docker exec -it pthd pip install fire
- run:
name: "Run without backend"
command: |
export example_path="examples/contrib/cifar10"
# initial run
export stop_cmd="--stop_iteration=500"
export test_cmd="CI=1 python ${example_path}/main.py run"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

- run:
name: "Run with NCCL backend using torch dist launch"
command: |
export example_path="examples/contrib/cifar10"
# initial run
export stop_cmd="--stop_iteration=500"
export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"

- run:
name: "Run with NCCL backend using spawn"
command: |
export example_path="examples/contrib/cifar10"
# initial run
export stop_cmd="--stop_iteration=500"
export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --num_procs_per_node=2"
docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
# resume
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"


# -------------------------------------------------------------------------------------
# Workflows
Expand All @@ -146,3 +203,4 @@ workflows:
jobs:
- one_gpu_tests
- two_gpus_tests
- two_gpus_check_dist_cifar10_example
81 changes: 74 additions & 7 deletions docs/source/distributed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,87 @@ Helper module to use distributed settings for multiple backends:

- XLA on TPUs via `pytorch/xla <https://github.com/pytorch/xla>`_

This module wraps common methods to fetch information about distributed configuration, initialize/finalize process
group or spawn multiple processes.
Distributed launcher and `auto` helpers
---------------------------------------

We provide a context manager to simplify the code of distributed configuration setup for all of the backends listed above.
In addition, methods like :meth:`~ignite.distributed.auto.auto_model`, :meth:`~ignite.distributed.auto.auto_optim` and
:meth:`~ignite.distributed.auto.auto_dataloader` help to transparently adapt the provided model, optimizer and data
loaders to the existing configuration:

.. code-block:: python

# main.py

import ignite.distributed as idist

def training(local_rank, config, **kwargs):

print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())

train_loader = idist.auto_dataloader(dataset, batch_size=32, num_workers=12, shuffle=True, **kwargs)
# batch size, num_workers and sampler are automatically adapted to existing configuration
# ...
model = resnet50()
model = idist.auto_model(model)
# model is DDP or DP or just itself according to existing configuration
# ...
optimizer = optim.SGD(model.parameters(), lr=0.01)
optimizer = idist.auto_optim(optimizer)
# optimizer is itself, except in XLA configuration, where its `step()` method is overridden.
# User can safely call `optimizer.step()` (behind the scenes, `xm.optimizer_step(optimizer)` is performed)


backend = "nccl" # torch native distributed configuration on multiple GPUs
# backend = "xla-tpu" # XLA TPUs distributed configuration
# backend = None # no distributed configuration
with idist.Parallel(backend=backend, **dist_configs) as parallel:
parallel.run(training, config, a=1, b=2)

The above code may be executed with the `torch.distributed.launch`_ tool, or directly with python by specifying the
distributed configuration in the code. For more details, please see :class:`~ignite.distributed.launcher.Parallel`,
:meth:`~ignite.distributed.auto.auto_model`, :meth:`~ignite.distributed.auto.auto_optim` and
:meth:`~ignite.distributed.auto.auto_dataloader`.

A complete example of CIFAR10 training can be found
`here <https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10>`_.


.. _torch.distributed.launch: https://pytorch.org/docs/stable/distributed.html#launch-utility

Examples:

- Example to spawn `nprocs` processes that run `fn` with `args`: :meth:`~ignite.distributed.spawn`
ignite.distributed.auto
-----------------------

.. currentmodule:: ignite.distributed.auto

.. automodule:: ignite.distributed.auto
:members:


ignite.distributed.launcher
---------------------------

.. currentmodule:: ignite.distributed.launcher

.. automodule:: ignite.distributed.launcher
:members:

.. currentmodule:: ignite.distributed

.. automodule:: ignite.distributed
ignite.distributed.utils
------------------------

This module wraps common methods to fetch information about distributed configuration, initialize/finalize process
group or spawn multiple processes.

.. currentmodule:: ignite.distributed.utils

.. automodule:: ignite.distributed.utils
:members:
:imported-members:

.. attribute:: has_native_dist_support

True if `torch.distributed` is available

.. attribute:: has_xla_support

Expand Down
4 changes: 1 addition & 3 deletions examples/contrib/cifar10/.gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
output
cifar10
.polyaxonignore
.polyaxon
plx_configs/*.yaml
raw_pytorch
Loading