Refactor RunningStage usage in advance of implementing Trainer.validate() #4945

Merged: 39 commits, Mar 6, 2021
Changes from 23 commits

Commits
104d796  Update code (carmocca, Feb 24, 2021)
23c2d3b  More property updates (carmocca, Feb 24, 2021)
dd09134  Move properties. Introduce trainer._fitting (carmocca, Feb 25, 2021)
68469ed  Use trainer.fitting (carmocca, Feb 25, 2021)
77df20b  Merge branch 'master' into feature/trainer-validate-1 (carmocca, Feb 25, 2021)
89aa994  Fix reset dataloaders (carmocca, Feb 25, 2021)
3c6e99c  Unused code (carmocca, Feb 25, 2021)
9ba12d7  RunningStage.SANITY_CHECKING (carmocca, Feb 25, 2021)
f3d16a4  Use setters (carmocca, Feb 25, 2021)
0697c3e  Fix bugs (carmocca, Feb 25, 2021)
39686ae  Fix bugs (carmocca, Feb 25, 2021)
0ed386b  TrainerState.{FITTING,VALIDATING,TESTING,PREDICTING,TUNING} (carmocca, Feb 25, 2021)
18e851c  Fix bugs (carmocca, Feb 25, 2021)
dec84ec  Fix bugs (carmocca, Feb 25, 2021)
0b21f4d  Fix tests (carmocca, Feb 25, 2021)
94707ba  Merge branch 'master' into feature/trainer-validate-1 (carmocca, Feb 25, 2021)
8cdac8e  Update CHANGELOG. Add deprecation warning. Fix tests (carmocca, Feb 25, 2021)
73916f4  Unused imports (carmocca, Feb 25, 2021)
6c62eec  Optional trainer (carmocca, Feb 25, 2021)
0a211a2  More deprecation. More refactoring (carmocca, Feb 25, 2021)
6c6752c  Correct version (carmocca, Feb 25, 2021)
34ae418  Use properties (carmocca, Feb 26, 2021)
24f1c1e  Address comments (carmocca, Mar 1, 2021)
13483f5  Merge branch 'master' into feature/trainer-validate-1 (carmocca, Mar 3, 2021)
1e5d84d  flake8 (carmocca, Mar 3, 2021)
dccf603  Missed renamings (carmocca, Mar 3, 2021)
56b4494  Typo (carmocca, Mar 3, 2021)
94e93ad  is -> == (carmocca, Mar 3, 2021)
6c53cdc  Also for tests (carmocca, Mar 3, 2021)
99db197  Merge branch 'master' into feature/trainer-validate-1 (carmocca, Mar 4, 2021)
1b0709d  Merge branch 'master' into feature/trainer-validate-1 (carmocca, Mar 5, 2021)
b64b46e  Typo (carmocca, Mar 5, 2021)
7d42798  Address @tchaton's comments (carmocca, Mar 5, 2021)
7a3f8cd  PEP8 (carmocca, Mar 5, 2021)
c0ef3fa  Correct property (carmocca, Mar 5, 2021)
63c9493  Update CHANGELOG (carmocca, Mar 5, 2021)
10f7f21  Apply suggestions from code review (carmocca, Mar 6, 2021)
d1dc4c9  Update pytorch_lightning/trainer/trainer.py (carmocca, Mar 6, 2021)
45a010f  Remove called sanity check (carmocca, Mar 6, 2021)
CHANGELOG.md (18 additions, 0 deletions)
@@ -15,12 +15,30 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))


- Added `RunningStage.SANITY_CHECKING` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


- Added `TrainerState.{FITTING,VALIDATING,TESTING,PREDICTING,TUNING}` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


### Changed


- Refactor `RunningStage` and `TrainerState` usage ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


- Changed `trainer.evaluating` to return `True` if validating or testing ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


### Deprecated


- Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


- Deprecated `trainer.tested_ckpt_path` in favor of `trainer.evaluated_ckpt_path` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


### Removed

- Removed support for passing a bool value to `profiler` argument of Trainer ([#6164](https://github.com/PyTorchLightning/pytorch-lightning/pull/6164))
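Taken together, these CHANGELOG entries describe a small user-facing shift in how the trainer's stage is queried. The callback below is a hypothetical sketch, not part of the PR; it only assumes the names listed above (`TrainerState.FITTING`, `trainer.sanity_checking`, `trainer.evaluating`).

```python
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.trainer.states import TrainerState


class StageAwareCallback(Callback):
    """Illustrative callback that branches on the refactored trainer state."""

    def on_validation_end(self, trainer, pl_module):
        if trainer.sanity_checking:  # replaces the deprecated trainer.running_sanity_check
            return
        if trainer.state is TrainerState.FITTING:
            print("validation loop ran inside fit()")
        elif trainer.evaluating:  # now True while validating or testing
            print("standalone evaluation run")
```

Passing such a callback via `Trainer(callbacks=[StageAwareCallback()])` would hit the first branch during `fit()` and the second during `trainer.test()` (or, after the follow-up PR, `trainer.validate()`).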
pytorch_lightning/accelerators/accelerator.py (5 additions, 4 deletions)
@@ -20,6 +20,7 @@
from pytorch_lightning.core import LightningModule
from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
from pytorch_lightning.plugins.training_type import TrainingTypePlugin
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
from pytorch_lightning.utilities.enums import AMPType, LightningEnum
@@ -81,8 +82,8 @@ def setup(self, trainer: 'Trainer', model: LightningModule) -> None:
def start_training(self, trainer: 'Trainer') -> None:
self.training_type_plugin.start_training(trainer)

def start_testing(self, trainer: 'Trainer') -> None:
self.training_type_plugin.start_testing(trainer)
def start_evaluating(self, trainer: 'Trainer') -> None:
self.training_type_plugin.start_evaluating(trainer)

def start_predicting(self, trainer: 'Trainer') -> None:
self.training_type_plugin.start_predicting(trainer)
@@ -330,7 +331,7 @@ def setup_optimizers(self, trainer: 'Trainer') -> None:
trainer: the Trainer, these optimizers should be connected to
model: the model to be optimized by the created optimizers
"""
if trainer.testing:
if trainer.state not in (TrainerState.FITTING, TrainerState.TUNING):
return
optimizers, lr_schedulers, optimizer_frequencies = self.training_type_plugin.init_optimizers(
trainer=trainer, model=self.lightning_module
@@ -426,7 +427,7 @@ def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[I
@property
def results(self) -> Any:
"""
The results of the last training/testing run will be cached within the training type plugin.
The results of the last run will be cached within the training type plugin.
In distributed training, we make sure to transfer the results to the appropriate master process.
"""
return self.training_type_plugin.results
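The `setup_optimizers` hunk above swaps the old `trainer.testing` check for an explicit allow-list of states that actually run an optimization loop. A minimal sketch of that predicate, assuming only the `TrainerState` members added in this PR (the helper name is made up for illustration):

```python
from pytorch_lightning.trainer.states import TrainerState


def needs_optimizers(trainer) -> bool:
    # Only fit() and tune() optimize parameters; validate/test/predict skip optimizer setup.
    return trainer.state in (TrainerState.FITTING, TrainerState.TUNING)
```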
pytorch_lightning/callbacks/early_stopping.py (4 additions, 3 deletions)
@@ -133,12 +133,13 @@ def on_load_checkpoint(self, callback_state: Dict[str, Any]):
self.patience = callback_state['patience']

def on_validation_end(self, trainer, pl_module):
if trainer.running_sanity_check:
from pytorch_lightning.trainer.states import TrainerState
if trainer.state is not TrainerState.FITTING or trainer.sanity_checking:
return

self._run_early_stopping_check(trainer, pl_module)
self._run_early_stopping_check(trainer)

def _run_early_stopping_check(self, trainer, pl_module):
def _run_early_stopping_check(self, trainer):
"""
Checks whether the early stopping condition is met
and if so tells the trainer to stop the training.
pytorch_lightning/callbacks/model_checkpoint.py (3 additions, 1 deletion)
@@ -213,12 +213,14 @@ def save_checkpoint(self, trainer, pl_module):
epoch = trainer.current_epoch
global_step = trainer.global_step

from pytorch_lightning.trainer.states import TrainerState
if (
trainer.fast_dev_run # disable checkpointing with fast_dev_run
or trainer.state is not TrainerState.FITTING # don't save anything during non-fit
or trainer.sanity_checking # don't save anything during sanity check
or self.save_top_k == 0 # no models are saved
or self.period < 1 # no models are saved
or (epoch + 1) % self.period # skip epoch
or trainer.running_sanity_check # don't save anything during sanity check
or self._last_global_step_saved == global_step # already saved at the last step
):
return
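Both callback changes above converge on the same skip rule: act only while fitting, and never during the sanity check. A hedged sketch of that shared predicate (the function name is hypothetical):

```python
from pytorch_lightning.trainer.states import TrainerState


def should_skip_callback(trainer) -> bool:
    # Mirrors the guards added to EarlyStopping.on_validation_end and
    # ModelCheckpoint.save_checkpoint in the hunks above.
    return trainer.state is not TrainerState.FITTING or trainer.sanity_checking
```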
pytorch_lightning/callbacks/progress.py (3 additions, 2 deletions)
@@ -380,7 +380,6 @@ def init_test_tqdm(self) -> tqdm:
def on_sanity_check_start(self, trainer, pl_module):
super().on_sanity_check_start(trainer, pl_module)
self.val_progress_bar = self.init_sanity_tqdm()
reset(self.val_progress_bar, sum(trainer.num_sanity_val_batches))
self.main_progress_bar = tqdm(disable=True) # dummy progress bar

def on_sanity_check_end(self, trainer, pl_module):
@@ -412,7 +411,9 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, data

def on_validation_start(self, trainer, pl_module):
super().on_validation_start(trainer, pl_module)
if not trainer.running_sanity_check:
if trainer.sanity_checking:
reset(self.val_progress_bar, sum(trainer.num_sanity_val_batches))
else:
self._update_bar(self.main_progress_bar) # fill up remaining
self.val_progress_bar = self.init_validation_tqdm()
reset(self.val_progress_bar, self.total_val_batches)
pytorch_lightning/core/lightning.py (1 addition, 8 deletions)
@@ -24,7 +24,7 @@
from argparse import Namespace
from functools import partial
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, TYPE_CHECKING, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from torch import ScriptModule, Tensor
@@ -44,9 +44,6 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args

if TYPE_CHECKING:
from pytorch_lightning.trainer.states import RunningStage


class LightningModule(
ABC,
@@ -171,10 +168,6 @@ def automatic_optimization(self) -> bool:
"""
return self._automatic_optimization

@property
def running_stage(self) -> Optional["RunningStage"]:
return self.trainer._running_stage if self.trainer else None

@automatic_optimization.setter
def automatic_optimization(self, automatic_optimization: bool) -> None:
self._automatic_optimization = automatic_optimization
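With `LightningModule.running_stage` removed, code that compared against `RunningStage` values is expected to query the attached trainer's boolean properties instead. A hedged migration sketch (user-level code, not part of the diff; it assumes the properties used elsewhere in this PR):

```python
def current_stage(pl_module) -> str:
    trainer = pl_module.trainer
    if trainer is None:            # module not attached to a Trainer
        return "detached"
    if trainer.training:
        return "train"
    if trainer.sanity_checking or trainer.validating:
        return "validation"
    if trainer.testing:
        return "test"
    if trainer.predicting:
        return "predict"
    return "idle"
```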
pytorch_lightning/overrides/base.py (6 additions, 7 deletions)
@@ -18,7 +18,6 @@
from torch.nn.parallel import DistributedDataParallel

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
from pytorch_lightning.utilities.warnings import WarningCache

@@ -43,28 +42,28 @@ def __init__(self, pl_module: LightningModule):
self.module = pl_module

def forward(self, *inputs, **kwargs):
running_stage = self.module.running_stage
trainer = self.module.trainer

if running_stage == RunningStage.TRAINING:
if trainer and trainer.training:
output = self.module.training_step(*inputs, **kwargs)

# In manual_optimization, we need to prevent DDP reducer as
# it is done manually in ``LightningModule.manual_backward``
# `require_backward_grad_sync` will be reset in the
# ddp_plugin ``post_training_step`` hook
if not self.module.automatic_optimization:
self.module.trainer.model.require_backward_grad_sync = False
trainer.model.require_backward_grad_sync = False
warn_if_output_is_none(output, "training_step")

elif running_stage == RunningStage.TESTING:
elif trainer and trainer.testing:
output = self.module.test_step(*inputs, **kwargs)
warn_if_output_is_none(output, "test_step")

elif running_stage == RunningStage.EVALUATING:
elif trainer and (trainer.sanity_checking or trainer.validating):
output = self.module.validation_step(*inputs, **kwargs)
warn_if_output_is_none(output, "validation_step")

elif running_stage == RunningStage.PREDICTING:
elif trainer and trainer.predicting:
output = self.module.predict(*inputs, **kwargs)
warn_if_output_is_none(output, "predict")

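Note that every branch above is guarded with `trainer and ...` rather than assuming a stage is always set: a `LightningModule` used outside of a `Trainer` has no trainer attached. A small sketch of that situation (the model class is illustrative; it assumes an unattached module's `trainer` attribute is `None` in this release):

```python
import torch
from pytorch_lightning import LightningModule


class TinyModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.layer(x)


model = TinyModel()
assert model.trainer is None       # no stage can be queried here
out = model(torch.randn(1, 4))     # plain forward still works, no *_step dispatch
```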
pytorch_lightning/plugins/training_type/ddp_spawn.py (8 additions, 3 deletions)
@@ -27,6 +27,7 @@
from pytorch_lightning.overrides.distributed import prepare_for_backward
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7
from pytorch_lightning.utilities.cloud_io import atomic_save
from pytorch_lightning.utilities.cloud_io import load as pl_load
@@ -156,7 +157,7 @@ def new_process(self, process_idx, trainer, mp_queue):

self.barrier()

results = trainer.train_or_test_or_predict()
results = trainer.run_stage()

# persist info in ddp_spawn
self.transfer_distrib_spawn_state_on_fit_end(results)
@@ -218,7 +219,11 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
# save the last weights
last_path = None
# TODO: is there a better way than accessing trainer through model -> trainer?
if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0:
if (
self.lightning_module.trainer.state is TrainerState.FITTING
and best_model_path is not None
and len(best_model_path) > 0
):
last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
atomic_save(self.on_save(self.lightning_module.state_dict()), last_path)

@@ -235,7 +240,7 @@ def __recover_child_process_weights(self, best_path, last_path):
# todo, pass also best score

# load last weights
if last_path is not None and not self.lightning_module.trainer.testing:
if last_path is not None and self.lightning_module.trainer.state is TrainerState.FITTING:
ckpt = pl_load(last_path, map_location=lambda storage, loc: storage)
self.lightning_module.load_state_dict(ckpt)

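The spawn plugins now call a single `trainer.run_stage()` instead of `train_or_test_or_predict()`. The trainer-side implementation is not shown in this diff; the sketch below is only a rough guess at the shape of such a dispatcher, reusing the `run_train`/`run_evaluate` names that appear in the final hunk of this diff (the `run_predict` call is an assumption):

```python
def run_stage(trainer):
    # Hypothetical dispatcher: pick the loop that matches the current trainer state.
    if trainer.evaluating:             # validating or testing
        return trainer.run_evaluate()
    if trainer.predicting:
        return trainer.run_predict()   # assumed name, not shown in this diff
    return trainer.run_train()
```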
pytorch_lightning/plugins/training_type/deepspeed.py (2 additions, 3 deletions)
@@ -213,7 +213,7 @@ def init_deepspeed(self):
precision = self.lightning_module.trainer.accelerator.precision
model = LightningDeepSpeedModule(pl_module=self.model, precision=precision)

if self.lightning_module.trainer.training:
if self.lightning_module.trainer and self.lightning_module.trainer.training:
self._initialize_deepspeed_train(model)
else:
self._initialize_deepspeed_inference(model)
@@ -249,8 +249,7 @@ def _initialize_deepspeed_train(self, model):
)

# set optimizer for save/load, but deepspeed manages the specific optimizer logic
trainer = self.lightning_module.trainer
trainer.optimizers = [optimizer]
self.lightning_module.trainer.optimizers = [optimizer]
self.model = model

def _initialize_deepspeed_inference(self, model):
pytorch_lightning/plugins/training_type/rpc_sequential.py (4 additions, 4 deletions)
@@ -24,7 +24,7 @@
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.overrides.distributed import LightningDistributedModule
from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only
from pytorch_lightning.utilities.exceptions import MisconfigurationException

@@ -208,7 +208,7 @@ def _skip_init_connections(self):
Returns: Whether to skip initialization

"""
return torch_distrib.is_initialized() and self.lightning_module.running_stage == RunningStage.TESTING
return torch_distrib.is_initialized() and self.lightning_module.trainer.state is not TrainerState.FITTING

def init_model_parallel_groups(self):
num_model_parallel = 1 # TODO currently no support for vertical model parallel
@@ -231,7 +231,7 @@ def _infer_check_num_gpus(self):
return self.world_size

def handle_transferred_pipe_module(self) -> None:
if not self.lightning_module.running_stage == RunningStage.TESTING:
if self.lightning_module.trainer.state is TrainerState.FITTING:
torch_distrib.barrier() # Ensure we await main process initialization
# Add trainer/configure_optimizers to the pipe model for access in all worker processes
rpc_pipe.PipeModel.trainer = self.lightning_module.trainer
Expand All @@ -243,7 +243,7 @@ def init_pipe_module(self) -> None:
# Create pipe_module
model = self.lightning_module
self._find_and_init_pipe_module(model)
if not self.lightning_module.running_stage == RunningStage.TESTING:
if self.lightning_module.trainer.state is TrainerState.FITTING:
torch_distrib.barrier() # Ensure we join main process initialization
model.sequential_module.foreach_worker(register_optimizers, include_self=True)

pytorch_lightning/plugins/training_type/sharded.py (2 additions, 2 deletions)
@@ -3,6 +3,7 @@
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.core.optimizer import is_lightning_optimizer
from pytorch_lightning.plugins.training_type.ddp import DDPPlugin
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only

if _FAIRSCALE_AVAILABLE:
@@ -35,8 +36,7 @@ def _reinit_optimizers_with_oss(self):
trainer.convert_to_lightning_optimizers()

def _wrap_optimizers(self):
trainer = self.model.trainer
if trainer.testing is True:
if self.model.trainer.state is not TrainerState.FITTING:
return
self._reinit_optimizers_with_oss()

pytorch_lightning/plugins/training_type/sharded_spawn.py (2 additions, 2 deletions)
@@ -2,6 +2,7 @@

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only

if _FAIRSCALE_AVAILABLE:
@@ -31,8 +32,7 @@ def _reinit_optimizers_with_oss(self):
trainer.optimizers = optimizers

def _wrap_optimizers(self):
trainer = self.model.trainer
if trainer.testing:
if self.model.trainer.state is not TrainerState.FITTING:
return
self._reinit_optimizers_with_oss()

pytorch_lightning/plugins/training_type/tpu_spawn.py (9 additions, 5 deletions)
@@ -9,6 +9,7 @@
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn
from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -97,7 +98,7 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
trainer.save_checkpoint = self.save_checkpoint
self.barrier()

results = trainer.train_or_test_or_predict()
results = trainer.run_stage()

self.__save_end_of_training_weights(self.lightning_module)
self.transfer_distrib_spawn_state_on_fit_end(results)
@@ -124,7 +125,11 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
# save the last weights
last_path = None
# TODO: is there a better way than accessing trainer through model -> trainer?
if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0:
if (
self.lightning_module.trainer.state is TrainerState.FITTING
and best_model_path is not None
and len(best_model_path) > 0
):
last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
xm.save(self.lightning_module.state_dict(), last_path)

@@ -214,7 +219,7 @@ def post_dispatch(self) -> None:
# todo, pass also best score

# load last weights
if last_path and not self.lightning_module.trainer.testing:
if last_path and model.trainer.state is not TrainerState.FITTING:
ckpt = torch.load(last_path, map_location=lambda storage, loc: storage)
model.load_state_dict(ckpt)

@@ -227,8 +232,7 @@ def __load_weights_on_main_process(self) -> None:
model = self.lightning_module

# load weights if not interrupted
# TODO: check for trainer reference
if on_colab_kaggle() and not model.trainer.testing:
if on_colab_kaggle() and model.trainer.state is TrainerState.FITTING:
self.load_spawn_weights(model)

self._model = model
@@ -117,9 +117,9 @@ def start_training(self, trainer: 'Trainer') -> None:
# double dispatch to initiate the training loop
self._results = trainer.run_train()

def start_testing(self, trainer: 'Trainer') -> None:
def start_evaluating(self, trainer: 'Trainer') -> None:
# double dispatch to initiate the test loop
self._results = trainer.run_test()
self._results = trainer.run_evaluate()

def start_predicting(self, trainer: 'Trainer') -> None:
# double dispatch to initiate the predicting loop
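The final hunk (its file header was not captured in this extract, though the methods match the base training-type plugin) completes the rename: `start_testing`/`run_test` become `start_evaluating`/`run_evaluate`, so both `trainer.test()` and the `trainer.validate()` added by the follow-up PR can share one evaluation entry point. A hedged sketch of where that double dispatch lands, using the import path shown earlier in this diff (the subclass is purely illustrative):

```python
from pytorch_lightning.plugins.training_type import TrainingTypePlugin


class LoggingPlugin(TrainingTypePlugin):  # illustrative subclass, not part of the PR
    def start_evaluating(self, trainer):
        # both trainer.test() and (after the follow-up PR) trainer.validate() arrive here
        print(f"evaluating, trainer.state={trainer.state}")
        return super().start_evaluating(trainer)
```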