 from collections import OrderedDict
 from contextlib import contextmanager, suppress
 from copy import copy, deepcopy
-from typing import Any, Dict, List, Optional, Union
+from functools import partial, update_wrapper
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
 import torch
+from torch.optim import Optimizer

 from pytorch_lightning.core.optimizer import LightningOptimizer
 from pytorch_lightning.core.step_result import Result
@@ -82,9 +84,8 @@ def __init__(
         self.trainer.num_sanity_val_steps = num_sanity_val_steps

     @property
-    def num_optimizers(self):
-        num_optimizers = len(self.get_optimizers_iterable())
-        return num_optimizers
+    def num_active_optimizers(self) -> int:
+        return len(self.get_active_optimizers())

     @property
     def optimizer_freq_cumsum(self):
@@ -234,23 +235,25 @@ def _should_add_batch_output_to_epoch_output(self) -> bool:

         return False

-    def get_optimizers_iterable(self, batch_idx=None):
+    def get_active_optimizers(self, batch_idx: Optional[int] = None) -> List[Tuple[int, Optimizer]]:
         """
-        Generates an iterable with (idx, optimizer) for each optimizer.
+        Returns the currently active optimizers. When multiple optimizers are used with different frequencies,
+        only one of the optimizers is active at a time.
+
+        Returns:
+            A list of tuples (opt_idx, optimizer) of currently active optimizers.
         """
         if not self.trainer.optimizer_frequencies:
             # call training_step once per optimizer
             return list(enumerate(self.trainer.optimizers))

-        if batch_idx is None:
-            batch_idx = self.total_batch_idx
-
+        batch_idx = self.total_batch_idx if batch_idx is None else batch_idx
         optimizers_loop_length = self.optimizer_freq_cumsum[-1]
         current_place_in_loop = batch_idx % optimizers_loop_length

         # find optimizer index by looking for the first {item > current_place} in the cumsum list
-        opt_idx = np.argmax(self.optimizer_freq_cumsum > current_place_in_loop)
-        return [[opt_idx, self.trainer.optimizers[opt_idx]]]
+        opt_idx = int(np.argmax(self.optimizer_freq_cumsum > current_place_in_loop))
+        return [(opt_idx, self.trainer.optimizers[opt_idx])]

     def on_after_backward(self, training_step_output, batch_idx, untouched_loss):
         training_step_output.detach()
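
Editor's note: the frequency handling above reduces to a cumulative-sum lookup. A standalone sketch of that selection logic (illustrative names, not part of this commit), assuming two optimizers with frequencies [2, 1]:

import numpy as np

def pick_active_optimizer_index(frequencies, batch_idx):
    # frequencies = [2, 1]: optimizer 0 handles 2 batches, then optimizer 1 handles 1 batch, repeating
    freq_cumsum = np.cumsum(frequencies)          # [2, 3]
    place_in_loop = batch_idx % freq_cumsum[-1]   # position inside one 3-batch cycle
    # the first cumsum entry strictly greater than the current position marks the active optimizer
    return int(np.argmax(freq_cumsum > place_in_loop))

assert [pick_active_optimizer_index([2, 1], i) for i in range(6)] == [0, 0, 1, 0, 0, 1]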
@@ -471,7 +474,7 @@ def run_training_epoch(self):
         train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader)

         # track epoch output
-        epoch_output = [[] for _ in range(self.num_optimizers)]
+        epoch_output = [[] for _ in range(self.num_active_optimizers)]

         train_dataloader = self.trainer.data_connector.get_profiled_train_dataloader(train_dataloader)
         dataloader_idx = 0
@@ -660,7 +663,7 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
         # bookkeeping
         self._hiddens = None

-        optimizers = self.prepare_optimizers()
+        optimizers = list(enumerate(self.trainer.optimizers))

         # track all outputs across time and num of optimizers
         batch_outputs = [[] for _ in range(len(optimizers))]
@@ -689,69 +692,88 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
         for split_idx, split_batch in enumerate(splits):
             self.split_idx = split_idx

-            # create an iterable for optimizers and loop over them
-            for opt_idx, optimizer in optimizers:
-
-                # toggle model params + set info to logger_connector
-                self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)
-
-                result = AttributeDict()
-                if self.should_accumulate():
-                    # For gradient accumulation
-
-                    # -------------------
-                    # calculate loss (train step + train step end)
-                    # -------------------
+            if self.trainer.lightning_module.automatic_optimization:
+                for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
+                    result = self._run_optimization(batch_idx, split_idx, split_batch, opt_idx, optimizer)
+                    if result:
+                        batch_outputs[opt_idx].append(result.training_step_output_for_epoch_end)
+                        grad_norm_dict = result.get("grad_norm_dict", {})
+            else:
+                # in manual optimization, there is no looping over optimizers
+                result = self._run_optimization(batch_idx, split_idx, split_batch)
+                if result:
+                    batch_outputs[0].append(result.training_step_output_for_epoch_end)
+
+        output = AttributeDict(
+            signal=0,
+            # todo: Properly aggregate grad_norm across opt_idx and split_idx
+            grad_norm_dict=grad_norm_dict,
+            training_step_output_for_epoch_end=batch_outputs,
+        )
+        return output

-                    # automatic_optimization=True: perform dpp sync only when performing optimizer_step
-                    # automatic_optimization=False: don't block synchronization here
-                    with self.block_ddp_sync_behaviour():
-                        self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self._hiddens)
+    def _run_optimization(self, batch_idx, split_idx, split_batch, opt_idx=0, optimizer=None):
+        # TODO: In v1.5, when optimizer_idx gets removed from training_step in manual_optimization, change
+        #   opt_idx=0 to opt_idx=None in the signature here

-                    # ------------------------------
-                    # BACKWARD PASS
-                    # ------------------------------
-                    # gradient update with accumulated gradients
-                else:
-                    if self.trainer.lightning_module.automatic_optimization:
+        # toggle model params + set info to logger_connector
+        self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)

-                        def train_step_and_backward_closure():
-                            nonlocal result
-                            result = self.training_step_and_backward(
-                                split_batch, batch_idx, opt_idx, optimizer, self._hiddens
-                            )
-                            return None if result is None else result.loss
+        result = AttributeDict()
+        closure = self.make_closure(split_batch, batch_idx, opt_idx, optimizer, self._hiddens, result)

-                        # optimizer step
-                        self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
+        if self.should_accumulate():
+            # For gradient accumulation

-                    else:
-                        result = self.training_step(split_batch, batch_idx, opt_idx, self._hiddens)
+            # -------------------
+            # calculate loss (train step + train step end)
+            # -------------------

-                if not result:
-                    # user decided to skip optimization
-                    # make sure to zero grad.
-                    continue
+            # automatic_optimization=True: perform ddp sync only when performing optimizer_step
+            # automatic_optimization=False: don't block synchronization here
+            with self.block_ddp_sync_behaviour():
+                closure()

-                # todo: Properly aggregate grad_norm accros opt_idx and split_idx
-                grad_norm_dict = result.get("grad_norm_dict", {})
+        # ------------------------------
+        # BACKWARD PASS
+        # ------------------------------
+        # gradient update with accumulated gradients
+        else:
+            if self.trainer.lightning_module.automatic_optimization:
+                self.optimizer_step(optimizer, opt_idx, batch_idx, closure)
+            else:
+                result = self.training_step(split_batch, batch_idx, opt_idx, self._hiddens)

-                # update running loss + reset accumulated loss
-                self.update_running_loss(result.loss)
+            if not result:
+                # user decided to skip optimization
+                return result

-            batch_outputs = self._process_closure_result(
-                opt_closure_result=result,
-                batch_outputs=batch_outputs,
-                opt_idx=opt_idx,
-            )
+            # update running loss + reset accumulated loss
+            self.update_running_loss(result.loss)

-        result = AttributeDict(
-            signal=0,
-            grad_norm_dict=grad_norm_dict,
-            training_step_output_for_epoch_end=batch_outputs,
-        )
+        self._process_closure_result(result)
         return result

+    def training_step_and_backward_closure(
+        self,
+        split_batch: Any,
+        batch_idx: int,
+        opt_idx: int,
+        optimizer: Optimizer,
+        hiddens,
+        return_result: AttributeDict,
+    ) -> Optional[torch.Tensor]:
+
+        step_result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
+        if step_result is not None:
+            return_result.update(step_result)
+            return return_result.loss
+
+    def make_closure(self, *closure_args, **closure_kwargs: Any) -> Callable:
+        """ Wraps the training step closure into a partial object which will be called within ``optimizer.step``. """
+        partial_func = partial(self.training_step_and_backward_closure, *closure_args, **closure_kwargs)
+        return update_wrapper(partial_func, self.training_step_and_backward_closure)
+
     @contextmanager
     def block_ddp_sync_behaviour(self, should_block_sync: bool = False):
         """
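
Editor's note: make_closure works because functools.partial objects accept attribute assignment, so update_wrapper can copy the wrapped function's metadata onto the partial, and the resulting zero-argument callable can be handed to optimizer.step. A minimal, self-contained sketch of the same pattern with a stock torch optimizer (hypothetical names, not part of this commit):

from functools import partial, update_wrapper

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.LBFGS(model.parameters())  # an optimizer that re-invokes the closure internally
batch = torch.randn(8, 4)

def training_step_and_backward(batch, optimizer):
    # zero grads, compute a toy loss, backpropagate, and return the loss for the optimizer
    optimizer.zero_grad()
    loss = model(batch).pow(2).mean()
    loss.backward()
    return loss

closure = partial(training_step_and_backward, batch, optimizer)
update_wrapper(closure, training_step_and_backward)  # preserve __name__/__doc__ for introspection
optimizer.step(closure)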
@@ -776,22 +798,16 @@ def block_ddp_sync_behaviour(self, should_block_sync: bool = False):
         else:
             yield None

-    def _process_closure_result(
-        self, opt_closure_result: Optional[AttributeDict], batch_outputs: list, opt_idx: int
-    ) -> list:
-        if opt_closure_result:
-            # cache metrics
-            self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result)
-
-            # check if loss or model weights are nan
-            if self.trainer.terminate_on_nan:
-                self._check_finite(opt_closure_result.loss)
+    def _process_closure_result(self, opt_closure_result: Optional[AttributeDict]) -> None:
+        if not opt_closure_result:
+            return

-            # track all the outputs across all steps
-            batch_opt_idx = opt_idx if len(batch_outputs) > 1 else 0
-            batch_outputs[batch_opt_idx].append(opt_closure_result.training_step_output_for_epoch_end)
+        # cache metrics
+        self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result)

-        return batch_outputs
+        # check if loss or model weights are nan
+        if self.trainer.terminate_on_nan:
+            self._check_finite(opt_closure_result.loss)

     def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens):
         """Wrap forward, zero_grad and backward in a closure so second order methods work"""
@@ -863,7 +879,7 @@ def update_train_loop_lr_schedulers(self, monitor_metrics=None):
         self.trainer.optimizer_connector.update_learning_rates(
             interval="step",
             monitor_metrics=monitor_metrics,
-            opt_indices=[opt_idx for opt_idx, _ in self.get_optimizers_iterable()],
+            opt_indices=[opt_idx for opt_idx, _ in self.get_active_optimizers()],
         )

     def increment_accumulated_grad_global_step(self):
@@ -961,13 +977,6 @@ def save_loggers_on_train_batch_end(self):
         if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None:
             self.trainer.logger.save()

-    def prepare_optimizers(self):
-        # in manual optimization we loop over all optimizers at once
-        optimizers = self.get_optimizers_iterable()
-        if not self.trainer.lightning_module.automatic_optimization:
-            optimizers = [optimizers[0]]
-        return optimizers
-
     def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer):
         # make sure only the gradients of the current optimizer's parameters are calculated
         # in the training step to prevent dangling gradients in multiple-optimizer setup.
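
Editor's note: the active-optimizer machinery above is exercised by a LightningModule that returns optimizer dicts with "frequency" entries from configure_optimizers. A hedged usage sketch (illustrative module, not part of this commit), assuming the optimizer/frequency dictionary format:

import torch
import pytorch_lightning as pl

class FrequencyExample(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.gen = torch.nn.Linear(16, 16)
        self.disc = torch.nn.Linear(16, 1)

    def training_step(self, batch, batch_idx, optimizer_idx):
        # only the optimizer selected by get_active_optimizers is stepped on this batch
        if optimizer_idx == 0:
            return self.gen(batch).pow(2).mean()
        return self.disc(batch).pow(2).mean()

    def configure_optimizers(self):
        # optimizer 0 runs for 2 batches, then optimizer 1 for 1 batch, repeating
        return [
            {"optimizer": torch.optim.Adam(self.gen.parameters(), lr=1e-3), "frequency": 2},
            {"optimizer": torch.optim.Adam(self.disc.parameters(), lr=1e-3), "frequency": 1},
        ]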