Commit e4611ef

awaelchli and carmocca authored
Support fused Adam with mixed precision (#15555)
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
1 parent bf4653e commit e4611ef

File tree

5 files changed, +179 -4 lines

src/pytorch_lightning/CHANGELOG.md

Lines changed: 2 additions & 0 deletions

@@ -61,6 +61,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue with `WandbLogger(log_model=True|'all')` raising an error and not being able to serialize tensors in the metadata ([#15544](https://github.com/Lightning-AI/lightning/pull/15544))
 
+- Fixed the gradient unscaling logic when using `Trainer(precision=16)` and fused optimizers such as `Adam(..., fused=True)` ([#15544](https://github.com/Lightning-AI/lightning/pull/15544))
+
 - Fixed model state transfer in multiprocessing launcher when running multi-node ([#15567](https://github.com/Lightning-AI/lightning/pull/15567))
 
 
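For context, the second entry covers the setup sketched below: a fused Adam optimizer created in `configure_optimizers` and trained under 16-bit native AMP. This is a minimal illustration assuming a CUDA machine and torch>=1.13; `FusedAdamModel` is a placeholder name, and `BoringModel` is used only as a stand-in module (it is also what the new tests in this commit use).

import torch
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel


class FusedAdamModel(BoringModel):
    # Placeholder LightningModule; any module can opt into the fused kernel the same way.
    def configure_optimizers(self):
        # `fused=True` requires torch>=1.13 and CUDA parameters.
        return torch.optim.Adam(self.parameters(), lr=1e-3, fused=True)


# With this fix, the AMP plugin skips `scaler.unscale_()` for optimizers that
# unscale gradients inside their own `step()`, so this configuration trains correctly.
trainer = Trainer(accelerator="cuda", devices=1, precision=16, max_steps=5)
trainer.fit(FusedAdamModel())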

src/pytorch_lightning/plugins/precision/native_amp.py

Lines changed: 32 additions & 4 deletions

@@ -16,13 +16,13 @@
 
 import torch
 from torch import Tensor
-from torch.optim import LBFGS
+from torch.optim import LBFGS, Optimizer
 
 import pytorch_lightning as pl
 from lightning_lite.accelerators.cuda import _patch_cuda_is_available
 from lightning_lite.utilities.types import Optimizable
 from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
-from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10, AMPType
+from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_10, AMPType, GradClipAlgorithmType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 if _TORCH_GREATER_EQUAL_1_10:
@@ -83,8 +83,13 @@ def optimizer_step( # type: ignore[override]
                 f"Native AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
             )
         closure_result = closure()
-        # `unscale` after the closure is executed but before the `on_before_optimizer_step` hook.
-        self.scaler.unscale_(optimizer)
+
+        if not _optimizer_handles_unscaling(optimizer):
+            # Unscaling needs to be performed here in case we are going to apply gradient clipping.
+            # Optimizers that perform unscaling in their `.step()` method are not supported (e.g., fused Adam).
+            # Note: `unscale` happens after the closure is executed, but before the `on_before_optimizer_step` hook.
+            self.scaler.unscale_(optimizer)
+
         self._after_closure(model, optimizer, optimizer_idx)
         skipped_backward = closure_result is None
         # in manual optimization, the closure does not return a value
@@ -95,6 +100,19 @@ def optimizer_step( # type: ignore[override]
             return step_output
         return closure_result
 
+    def clip_gradients(
+        self,
+        optimizer: Optimizer,
+        clip_val: Union[int, float] = 0.0,
+        gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM,
+    ) -> None:
+        if clip_val > 0 and _optimizer_handles_unscaling(optimizer):
+            raise RuntimeError(
+                f"The current optimizer, {type(optimizer).__qualname__}, does not allow for gradient clipping"
+                " because it performs unscaling of gradients internally. HINT: Are you using a 'fused' optimizer?"
+            )
+        super().clip_gradients(optimizer=optimizer, clip_val=clip_val, gradient_clip_algorithm=gradient_clip_algorithm)
+
     def autocast_context_manager(self) -> Union["old_autocast", "new_autocast"]:
         if _TORCH_GREATER_EQUAL_1_10:
             # the dtype could be automatically inferred but we need to manually set it due to a bug upstream
@@ -116,3 +134,13 @@ def state_dict(self) -> Dict[str, Any]:
     def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
         if self.scaler is not None:
             self.scaler.load_state_dict(state_dict)
+
+
+def _optimizer_handles_unscaling(optimizer: Any) -> bool:
+    """Determines whether a PyTorch optimizer handles unscaling gradients in the step method rather than through the
+    :class:`torch.cuda.amp.GradScaler`.
+
+    Since the current implementation of this function checks a PyTorch internal variable on the optimizer, the return
+    value will only be reliable for built-in PyTorch optimizers.
+    """
+    return getattr(optimizer, "_step_supports_amp_scaling", False)
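The helper keys off PyTorch's internal `_step_supports_amp_scaling` attribute: optimizers whose `step()` consumes the scale factor directly (such as `torch.optim.Adam(..., fused=True)` on torch>=1.13) set it, which is why the plugin must not call `scaler.unscale_()` for them and why gradient clipping is rejected. Below is a minimal sketch of the same branching outside Lightning, assuming a CUDA device and torch>=1.13; the model and tensor shapes are placeholders.

import torch

model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)  # sets `_step_supports_amp_scaling`
scaler = torch.cuda.amp.GradScaler()

with torch.autocast("cuda", dtype=torch.float16):
    loss = model(torch.randn(4, 8, device="cuda")).sum()
scaler.scale(loss).backward()

# Mirror of `_optimizer_handles_unscaling`: skip manual unscaling for fused optimizers.
if not getattr(optimizer, "_step_supports_amp_scaling", False):
    scaler.unscale_(optimizer)  # needed e.g. before clipping with a non-fused optimizer

scaler.step(optimizer)  # the fused step receives the scale and unscales internally
scaler.update()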

tests/tests_lite/plugins/precision/test_native_amp_integration.py

Lines changed: 36 additions & 0 deletions

@@ -18,6 +18,8 @@
 from tests_lite.helpers.models import BoringLite
 from tests_lite.helpers.runif import RunIf
 
+from lightning_lite import LightningLite, seed_everything
+
 
 class NativeMixedPrecisionModule(nn.Module):
     def __init__(self, expected_dtype):
@@ -70,3 +72,37 @@ def test_native_mixed_precision(accelerator, precision, expected_dtype):
     lite = NativeMixedPrecisionBoringLite(accelerator=accelerator, precision=precision)
     lite.expected_dtype = expected_dtype
     lite.run()
+
+
+@RunIf(min_torch="1.13", min_cuda_gpus=1)
+def test_native_mixed_precision_fused_optimizer_parity():
+    def run(fused=False):
+        seed_everything(1234)
+        lite = LightningLite(accelerator="cuda", precision=16, devices=1)
+
+        model = nn.Linear(10, 10).to(lite.device)  # TODO: replace with individual setup_model call
+        optimizer = torch.optim.Adam(model.parameters(), lr=1.0, fused=fused)
+
+        model, optimizer = lite.setup(model, optimizer)
+        assert isinstance(lite._precision.scaler, torch.cuda.amp.GradScaler)
+
+        data = torch.randn(10, 10, device="cuda")
+        target = torch.randn(10, 10, device="cuda")
+
+        losses = []
+        for _ in range(5):
+            optimizer.zero_grad()
+            output = model(data)
+            loss = (output - target).abs().sum()
+            lite.backward(loss)
+            optimizer.step()
+            losses.append(loss.detach())
+        return torch.stack(losses), model.parameters()
+
+    losses, params = run(fused=False)
+    losses_fused, params_fused = run(fused=True)
+
+    # Both the regular and the fused version of Adam produce the same losses and model weights
+    torch.testing.assert_close(losses, losses_fused)
+    for p, q in zip(params, params_fused):
+        torch.testing.assert_close(p, q)

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest.mock import Mock
+
+import pytest
+from torch.optim import Optimizer
+
+from pytorch_lightning.plugins import NativeMixedPrecisionPlugin
+from pytorch_lightning.utilities import GradClipAlgorithmType
+
+
+def test_clip_gradients():
+    """Test that `.clip_gradients()` is a no-op when clipping is disabled."""
+    optimizer = Mock(spec=Optimizer)
+    precision = NativeMixedPrecisionPlugin(precision=16, device="cuda:0", scaler=Mock())
+    precision.clip_grad_by_value = Mock()
+    precision.clip_grad_by_norm = Mock()
+    precision.clip_gradients(optimizer)
+    precision.clip_grad_by_value.assert_not_called()
+    precision.clip_grad_by_norm.assert_not_called()
+
+    precision.clip_gradients(optimizer, clip_val=1.0, gradient_clip_algorithm=GradClipAlgorithmType.VALUE)
+    precision.clip_grad_by_value.assert_called_once()
+    precision.clip_grad_by_norm.assert_not_called()
+
+    precision.clip_grad_by_value.reset_mock()
+    precision.clip_grad_by_norm.reset_mock()
+
+    precision.clip_gradients(optimizer, clip_val=1.0, gradient_clip_algorithm=GradClipAlgorithmType.NORM)
+    precision.clip_grad_by_value.assert_not_called()
+    precision.clip_grad_by_norm.assert_called_once()
+
+
+def test_optimizer_amp_scaling_support_in_step_method():
+    """Test that the plugin checks if the optimizer takes over unscaling in its step, making it incompatible with
+    gradient clipping (example: fused Adam)."""
+
+    optimizer = Mock(_step_supports_amp_scaling=True)
+    precision = NativeMixedPrecisionPlugin(precision=16, device="cuda:0", scaler=Mock())
+
+    with pytest.raises(RuntimeError, match="The current optimizer.*does not allow for gradient clipping"):
+        precision.clip_gradients(optimizer, clip_val=1.0)

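The second test pins down the user-facing failure mode: with the new `clip_gradients` override, asking for gradient clipping while using a fused optimizer is expected to raise the `RuntimeError` above rather than silently clipping still-scaled gradients. A hedged sketch of how that would surface at the Trainer level, assuming CUDA and torch>=1.13; `FusedAdamModel` is the same placeholder module as in the earlier sketch.

import torch
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel


class FusedAdamModel(BoringModel):
    # Placeholder module, as in the sketch after the CHANGELOG diff.
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3, fused=True)


# `gradient_clip_val > 0` routes through the plugin's `clip_gradients`, which is
# expected to raise: "... does not allow for gradient clipping ... 'fused' optimizer?"
trainer = Trainer(accelerator="cuda", devices=1, precision=16, gradient_clip_val=1.0, max_steps=1)
trainer.fit(FusedAdamModel())
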
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+from lightning_lite import seed_everything
+from pytorch_lightning import Trainer
+from pytorch_lightning.demos.boring_classes import BoringModel
+from tests_pytorch.helpers.runif import RunIf
+
+
+class FusedOptimizerParityModel(BoringModel):
+    def __init__(self, fused=False):
+        super().__init__()
+        self.fused = fused
+
+    def configure_optimizers(self):
+        assert isinstance(self.trainer.precision_plugin.scaler, torch.cuda.amp.GradScaler)
+        return torch.optim.Adam(self.parameters(), lr=1.0, fused=self.fused)
+
+
+@RunIf(min_torch="1.13", min_cuda_gpus=1)
+def test_native_mixed_precision_fused_optimizer_parity(tmpdir):
+    def run(fused=False):
+        seed_everything(1234)
+        model = FusedOptimizerParityModel(fused)
+        trainer = Trainer(
+            default_root_dir=tmpdir,
+            accelerator="cuda",
+            devices=1,
+            precision=16,
+            max_steps=5,
+            logger=False,
+            enable_checkpointing=False,
+            enable_progress_bar=False,
+            enable_model_summary=False,
+        )
+        trainer.fit(model)
+        return model.parameters()
+
+    params = run(fused=False)
+    params_fused = run(fused=True)
+
+    # Both the regular and the fused version of Adam produce the same model weights
+    for p, q in zip(params, params_fused):
+        torch.testing.assert_close(p, q)
