Lightning-AI · awaelchli · Jun 21, 2022 · Jun 1, 2022 · Jun 5, 2022 · Jun 5, 2022
@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging on non-zero rank processes ([#13233](https://github.com/PyTorchLightning/pytorch-lightning/pull/13233))
+
+
 - Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816))
 
 

@@ -63,6 +63,7 @@ class EarlyStopping(Callback):
         divergence_threshold: Stop training as soon as the monitored quantity becomes worse than this threshold.
         check_on_train_epoch_end: whether to run early stopping at the end of the training epoch.
             If this is ``False``, then the check runs at the end of the validation.
+        log_rank_zero_only: When set to ``True``, logs the status of the early stopping callback only on the rank 0 process.
 
     Raises:
         MisconfigurationException:
@@ -100,6 +101,7 @@ def __init__(
         stopping_threshold: Optional[float] = None,
         divergence_threshold: Optional[float] = None,
         check_on_train_epoch_end: Optional[bool] = None,
+        log_rank_zero_only: bool = False,
     ):
         super().__init__()
         self.monitor = monitor
@@ -114,6 +116,7 @@ def __init__(
         self.wait_count = 0
         self.stopped_epoch = 0
         self._check_on_train_epoch_end = check_on_train_epoch_end
+        self.log_rank_zero_only = log_rank_zero_only
 
         if self.mode not in self.mode_dict:
             raise MisconfigurationException(f"`mode` can be {', '.join(self.mode_dict.keys())}, got {self.mode}")
@@ -202,7 +205,7 @@ def _run_early_stopping_check(self, trainer: "pl.Trainer") -> None:
         if should_stop:
             self.stopped_epoch = trainer.current_epoch
         if reason and self.verbose:
-            self._log_info(trainer, reason)
+            self._log_info(trainer, reason, self.log_rank_zero_only)
 
     def _evaluate_stopping_criteria(self, current: Tensor) -> Tuple[bool, Optional[str]]:
         should_stop = False
@@ -255,8 +258,15 @@ def _improvement_message(self, current: Tensor) -> str:
         return msg
 
     @staticmethod
-    def _log_info(trainer: Optional["pl.Trainer"], message: str) -> None:
-        if trainer is not None and trainer.world_size > 1:
-            log.info(f"[rank: {trainer.global_rank}] {message}")
-        else:
-            log.info(message)
+    def _log_info(trainer: Optional["pl.Trainer"], message: str, log_rank_zero_only: bool) -> None:
+        if trainer:
+            # skip logging on non-zero ranks when the log_rank_zero_only flag is enabled
+            if log_rank_zero_only and trainer.global_rank != 0:
+                return
+            # with a world size above one, prefix the message with the rank that emits it
+            if trainer.world_size > 1:
+                log.info(f"[rank: {trainer.global_rank}] {message}")
+                return
+
+        # no trainer, or world size not above one: log the message without a rank prefix
+        log.info(message)
@@ -456,3 +456,36 @@ def test_early_stopping_squeezes():
         early_stopping._run_early_stopping_check(trainer)
 
     es_mock.assert_called_once_with(torch.tensor(0))
+
+
+@pytest.mark.parametrize("trainer", [Trainer(), None])
+@pytest.mark.parametrize(
+    "log_rank_zero_only, world_size, global_rank, expected_log",
+    [
+        (False, 1, 0, "bar"),
+        (False, 2, 0, "[rank: 0] bar"),
+        (False, 2, 1, "[rank: 1] bar"),
+        (True, 1, 0, "bar"),
+        (True, 2, 0, "[rank: 0] bar"),
+        (True, 2, 1, None),
+    ],
+)
+def test_early_stopping_log_info(tmpdir, trainer, log_rank_zero_only, world_size, global_rank, expected_log):
+    """Check that ``EarlyStopping._log_info`` calls ``log.info()`` with the expected message, or not at all."""
+
+    # with a trainer, set the rank and world size on its strategy;
+    # without one, the plain un-prefixed message is always expected
+    if trainer:
+        trainer.strategy.global_rank = global_rank
+        trainer.strategy.world_size = world_size
+    else:
+        expected_log = "bar"
+
+    with mock.patch("pytorch_lightning.callbacks.early_stopping.log.info") as log_mock:
+        EarlyStopping._log_info(trainer, "bar", log_rank_zero_only)
+
+    # a falsy expected_log (None) means the message must be suppressed; otherwise expect exactly one call
+    if expected_log:
+        log_mock.assert_called_once_with(expected_log)
+    else:
+        log_mock.assert_not_called()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

		### Added

		- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging on non-zero rank processes ([#13233](https://github.com/PyTorchLightning/pytorch-lightning/pull/13233))


		- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816))


Expand Down