From 0228c0bf81ef4e50af8020105a5f8187637c2315 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 8 Sep 2020 14:32:12 -0700
Subject: [PATCH 1/3] Added check for apex AMP and unit tests for Horovod + AMP

---
 .../accelerators/horovod_backend.py |  2 +-
 tests/models/test_horovod.py        | 25 +++++++++++++++++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/accelerators/horovod_backend.py b/pytorch_lightning/accelerators/horovod_backend.py
index f2c74107d9ba4..5e02f947ebf94 100644
--- a/pytorch_lightning/accelerators/horovod_backend.py
+++ b/pytorch_lightning/accelerators/horovod_backend.py
@@ -72,7 +72,7 @@ def setup(self, model):
             if isinstance(scheduler, _LRScheduler):
                 scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs]
 
-        if self.trainer.amp_backend:
+        if self.trainer.amp_backend == AMPType.APEX:
             model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
             self.trainer.optimizers = optimizers
             self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 7c6dc3b7417c5..42cb9a28d890d 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -42,8 +42,8 @@ def _nccl_available():
 def _run_horovod(trainer_options, on_gpu=False):
     """Execute the training script across multiple workers in parallel."""
     num_processes = trainer_options.get('gpus', 2)
-    # gpus trainer argument does not apply for horovod
-    trainer_options.update(gpus=None)
+    # for Horovod, we interpret `gpus` to be set per worker
+    trainer_options.update(gpus=1 if on_gpu else None)
     tutils.reset_seed()
     cmdline = [
         'horovodrun',
@@ -110,6 +110,27 @@ def test_horovod_multi_gpu(tmpdir):
     _run_horovod(trainer_options, on_gpu=True)
 
 
+@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
+@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_horovod_amp(tmpdir):
+    """Test Horovod with multi-GPU support and AMP (16-bit precision)."""
+    trainer_options = dict(
+        default_root_dir=str(tmpdir),
+        weights_save_path=str(tmpdir),
+        gradient_clip_val=1.0,
+        progress_bar_refresh_rate=0,
+        max_epochs=1,
+        limit_train_batches=0.4,
+        limit_val_batches=0.2,
+        gpus=2,
+        deterministic=True,
+        distributed_backend='horovod',
+        precision=16,
+    )
+    _run_horovod(trainer_options, on_gpu=True)
+
+
 @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
 @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")

From d61bc9b88e7b8b456d83dd0e237a363c9c80d1d6 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 8 Sep 2020 14:35:27 -0700
Subject: [PATCH 2/3] Changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13bce4c14d6f4..697ee6d835b5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -39,6 +39,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed setting batch size in `LightningModule.datamodule` when using `auto_scale_batch_size` ([#3266](https://github.com/PyTorchLightning/pytorch-lightning/pull/3266))
 
+- Fixed Horovod distributed backend compatibility with native AMP ([#3404](https://github.com/PyTorchLightning/pytorch-lightning/pull/3404))
+
 ## [0.9.0] - YYYY-MM-DD
 
 ### Added

From 62a9810a13801d7501657191fa506c86a2a7faf5 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Tue, 8 Sep 2020 16:46:23 -0700
Subject: [PATCH 3/3] Fixed order of Horovod and Apex optimizer wrapping

---
 pytorch_lightning/accelerators/horovod_backend.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/accelerators/horovod_backend.py b/pytorch_lightning/accelerators/horovod_backend.py
index 5e02f947ebf94..794eb2a2521df 100644
--- a/pytorch_lightning/accelerators/horovod_backend.py
+++ b/pytorch_lightning/accelerators/horovod_backend.py
@@ -72,11 +72,6 @@ def setup(self, model):
             if isinstance(scheduler, _LRScheduler):
                 scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs]
 
-        if self.trainer.amp_backend == AMPType.APEX:
-            model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
-            self.trainer.optimizers = optimizers
-            self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)
-
         # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
         hvd.broadcast_parameters(model.state_dict(), root_rank=0)
         for optimizer in self.trainer.optimizers:
@@ -92,6 +87,11 @@ def filter_named_parameters(model, optimizer):
             for optimizer in self.trainer.optimizers
         ]
 
+        if self.trainer.amp_backend == AMPType.APEX:
+            model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
+            self.trainer.optimizers = optimizers
+            self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)
+
         # Update logger rank info from Horovod to avoid race conditions from different ranks
         # creating directories / writing files in the same locations.
         self.trainer.global_rank = hvd.rank()
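
Reviewer note (not part of the patches above): the snippet below is a minimal standalone sketch of the ordering that PATCH 3/3 enforces in `HorovodBackend.setup()`: scale the learning rate by the world size, broadcast parameters and optimizer state from rank 0, wrap the optimizers with `hvd.DistributedOptimizer`, and only then configure Apex AMP. It is written against plain Horovod and Apex rather than Lightning internals, so the toy model, optimizer, learning rate, and `opt_level` are illustrative assumptions, not values taken from the patches.

    # Illustrative sketch only; mirrors the Horovod-then-Apex ordering from PATCH 3/3.
    import torch
    import horovod.torch as hvd
    from apex import amp  # requires NVIDIA Apex to be installed

    hvd.init()
    torch.cuda.set_device(hvd.local_rank())

    # Toy model and optimizer (assumptions); LR is scaled by the world size as in setup()
    model = torch.nn.Linear(32, 2).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: wrap the optimizer so gradients are allreduced across workers
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # Apex AMP is configured last, on the already-wrapped optimizer
    # (Lightning calls model.configure_apex(...) at this point instead)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    # One training step using Apex loss scaling
    optimizer.zero_grad()
    loss = model(torch.randn(8, 32, device='cuda')).sum()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

A script like this would be launched with `horovodrun -np 2 python <script>.py`, which is the same mechanism the `_run_horovod` helper in `tests/models/test_horovod.py` uses to drive the tests in PATCH 1/3.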