diff --git a/.drone.yml b/.drone.yml index bb4d8a74b28f5..84e97150752e7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,7 +20,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 environment: CODECOV_TOKEN: diff --git a/README.md b/README.md index 33ee544802914..30079df931759 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Lightning can automatically export to ONNX or TorchScript for those cases. | System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index b016b6c5d24fb..98bc8dfc87d25 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -29,8 +29,10 @@ def connect(self, model, optimizers): def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = self.trainer.scaler.scale(closure_loss) + automatic_optimization = self.trainer.train_loop.automatic_optimization + # do backward pass - if self.trainer.train_loop.automatic_optimization: + if automatic_optimization: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: @@ -40,7 +42,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.detach() # unscale gradient to allow analyze within `on_after_backward` - if not self.trainer.train_loop.should_accumulate(): + if not self.trainer.train_loop.should_accumulate() and automatic_optimization: self.trainer.scaler.unscale_(optimizer) return closure_loss diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 449bc8c712ddb..d09d9387ea485 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -25,6 +25,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE from tests.base import EvalModelTemplate from tests.base.models import BasicGAN @@ -126,8 +127,33 @@ def test_horovod_multi_gpu(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") +def test_horovod_apex(tmpdir): + """Test Horovod with multi-GPU support using apex amp.""" + trainer_options = dict( + default_root_dir=str(tmpdir), + weights_save_path=str(tmpdir), + gradient_clip_val=1.0, + progress_bar_refresh_rate=0, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + gpus=2, + deterministic=True, + distributed_backend='horovod', + amp_backend='apex', + precision=16, + ) + _run_horovod(trainer_options, on_gpu=True) + + +@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not NATIVE_AMP_AVALAIBLE, reason="test requires torch.cuda.amp") def test_horovod_amp(tmpdir): - """Test Horovod with multi-GPU support.""" + """Test Horovod with multi-GPU support using native amp.""" trainer_options = dict( default_root_dir=str(tmpdir), weights_save_path=str(tmpdir), @@ -139,6 +165,7 @@ def test_horovod_amp(tmpdir): gpus=2, deterministic=True, distributed_backend='horovod', + amp_backend='native', precision=16, ) _run_horovod(trainer_options, on_gpu=True) diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 6fd000b61d97f..6d7f0252d94c3 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -86,20 +86,19 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) +class GradientUnscaleBoringModel(BoringModel): + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + @pytest.mark.skipif( LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gradient_unscale(tmpdir): - - class ExtendedBoringModel(BoringModel): - - def on_after_backward(self): - norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) - if not (torch.isinf(norm) or torch.isnan(norm)): - assert norm.item() < 15. - - model = ExtendedBoringModel() + model = GradientUnscaleBoringModel() trainer = Trainer( max_epochs=2, @@ -117,19 +116,19 @@ def on_after_backward(self): trainer.fit(model) +class UnscaleAccumulateGradBatchesBoringModel(BoringModel): + + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + @pytest.mark.skipif( LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir): - - class ExtendedBoringModel(BoringModel): - - def on_after_backward(self): - norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) - if not (torch.isinf(norm) or torch.isnan(norm)): - assert norm.item() < 15. - - model = ExtendedBoringModel() + model = UnscaleAccumulateGradBatchesBoringModel() trainer = Trainer( max_epochs=2,