[fix] Better support for rank_zero_only setting for SLURM and torchelastic #6802

Merged · 22 commits · Apr 7, 2021
Changes from 6 commits
CHANGELOG.md — 3 additions, 1 deletion
@@ -170,6 +170,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))


- Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011))


@@ -196,7 +199,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/PyTorchLightning/pytorch-lightning/pull/6506))


## [1.2.6] - 2021-03-30

### Changed
pytorch_lightning/utilities/distributed.py — 11 additions, 1 deletion
@@ -44,8 +44,18 @@ def wrapped_fn(*args, **kwargs):
    return wrapped_fn


# TODO: this should be part of the cluster environment
def _get_rank() -> int:
Review comment from a contributor:

Can you move this directly to the SLURM cluster environment?

What if RANK, SLURM_PROCID, or LOCAL_RANK differ? Should we take the latest, or did you order rank_keys by priority?

Best,
T.C

Reply from the author (@ananthsub, Apr 7, 2021):

RANK = torchelastic
SLURM_PROCID = SLURM
LOCAL_RANK = parity with the existing setup, though I think it's not right

I put LOCAL_RANK last because RANK and SLURM_PROCID already correspond to the global rank. The linked issue has more discussion, but I think we should make global rank and world size properties of the cluster environment, so that the cluster environment becomes the source of truth, propagating from cluster environment => training type plugin => accelerator => trainer.

The main issue now is that the global rank isn't set on trainer initialization. If the cluster environment is marked as creating children, we can defer initializing these fields, but both torchelastic and SLURM already provide this data through environment variables, and we should expose it as early as possible (on Trainer init) so users can read this state.

Currently, these properties are only initialized on the training type plugin once trainer.fit() is called and the accelerator setup flow runs.

    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
    for key in rank_keys:
        rank = os.environ.get(key)
        if rank is not None:
            return int(rank)
    return 0
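
The lookup order above is what answers the reviewer's question: the tuple is ordered by priority, so a torchelastic `RANK` wins over `SLURM_PROCID`, which in turn wins over `LOCAL_RANK`. A small illustration (editorial example, not part of the PR diff):

```python
# Illustration only: with several rank variables set, _get_rank() returns the
# first match in priority order, so the torchelastic RANK takes precedence.
import os

from pytorch_lightning.utilities.distributed import _get_rank

os.environ.update({"RANK": "4", "SLURM_PROCID": "2", "LOCAL_RANK": "0"})
assert _get_rank() == 4
```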


# add the attribute to the function but don't overwrite in case Trainer has already set it
- rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+ rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())
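
The `getattr(..., default)` call means a rank that was already assigned (for example by the Trainer) is preserved, and the fallback from `_get_rank()` is only used when no `rank` attribute exists yet. A minimal sketch of that idiom, using a hypothetical stand-in object rather than Lightning's decorated function:

```python
# Hypothetical stand-in object to show the "don't overwrite if already set" idiom.
class _Holder:
    pass


holder = _Holder()
holder.rank = getattr(holder, "rank", 3)  # no attribute yet -> default 3 is used
holder.rank = getattr(holder, "rank", 0)  # attribute exists -> 3 is kept, not overwritten
assert holder.rank == 3
```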


def _warn(*args, **kwargs):
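The reply above proposes making global rank and world size properties of the cluster environment so that it becomes the single source of truth. A minimal sketch of that idea, with hypothetical class and method names rather than Lightning's actual ClusterEnvironment API:

```python
import os


class SlurmOrElasticEnvironment:
    """Hypothetical cluster environment that owns rank/world-size resolution."""

    def global_rank(self) -> int:
        # Prefer torchelastic's RANK, then SLURM_PROCID, defaulting to 0.
        for key in ("RANK", "SLURM_PROCID"):
            value = os.environ.get(key)
            if value is not None:
                return int(value)
        return 0

    def world_size(self) -> int:
        # WORLD_SIZE is set by torchelastic, SLURM_NTASKS by SLURM.
        for key in ("WORLD_SIZE", "SLURM_NTASKS"):
            value = os.environ.get(key)
            if value is not None:
                return int(value)
        return 1
```

In that design the training type plugin would read these properties instead of parsing environment variables itself, matching the propagation order described in the reply (cluster environment => training type plugin => accelerator => trainer).
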
tests/utilities/distributed.py — 46 additions, 0 deletions
@@ -16,6 +16,7 @@
import sys
from pathlib import Path
from subprocess import TimeoutExpired
from unittest import mock

import pytorch_lightning

@@ -42,3 +43,48 @@ def call_training_script(module_file, cli_args, method, tmpdir, timeout=60):
        p.kill()
        std, err = p.communicate()
    return std, err


@mock.patch.dict(os.environ, {"SLURM_PROCID": "0"})
def test_rank_zero_slurm():
""" Test that SLURM environment variables are properly checked for rank_zero_only. """
from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

@rank_zero_only
def foo():
# The return type is optional because on non-zero ranks it will not be called
return 1

x = foo()
assert x == 1


@mock.patch.dict(os.environ, {"RANK": "0"})
def test_rank_zero_torchelastic():
""" Test that torchelastic environment variables are properly checked for rank_zero_only. """
from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

@rank_zero_only
def foo():
# The return type is optional because on non-zero ranks it will not be called
return 1

x = foo()
assert x == 1


@mock.patch.dict(os.environ, {"RANK": "1", "SLURM_PROCID": "2", "LOCAL_RANK": "3"})
def test_rank_zero_none_set():
""" Test that function is not called when rank environment variables are not global zero. """

from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only
rank_zero_only.rank = _get_rank()

@rank_zero_only
def foo():
return 1

x = foo()
assert x is None
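
For context on the final assertion: `rank_zero_only` only calls the wrapped function when `rank_zero_only.rank == 0` and otherwise returns `None`, which is why `foo()` yields `None` once `_get_rank()` resolves to a non-zero rank here. A rough sketch of the decorator, consistent with the `wrapped_fn` shown in the hunk header above (editorial paraphrase, not copied from the file):

```python
from functools import wraps


def rank_zero_only(fn):
    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        # Only execute on the global-zero process; other ranks fall through and return None.
        if rank_zero_only.rank == 0:
            return fn(*args, **kwargs)

    return wrapped_fn
```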