Skip to content

Commit f3bb85c

Browse files
Atharva-Phatak authored and Borda committed
Fix ddp_spawn -> ddp fallback logic when on LSF cluster (#15657)
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
(cherry picked from commit cdb7006)
1 parent 2d2c902 commit f3bb85c

File tree

6 files changed

+80
-35
lines changed

6 files changed

+80
-35
lines changed

src/lightning_lite/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1717

1818
### Fixed
1919

20-
-
20+
- Fixed the automatic fallback from `LightningLite(strategy="ddp_spawn", ...)` to `LightningLite(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
21+
2122

2223

2324
## [1.8.1] - 2022-11-10

src/lightning_lite/connector.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -395,7 +395,10 @@ def _check_strategy_and_fallback(self) -> None:
395395
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
396396

397397
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
398-
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
398+
TorchElasticEnvironment.detect()
399+
or KubeflowEnvironment.detect()
400+
or SLURMEnvironment.detect()
401+
or LSFEnvironment.detect()
399402
):
400403
strategy_flag = "ddp"
401404
if strategy_flag == "dp" and self._accelerator_flag == "cpu":

src/pytorch_lightning/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
3434
- Fixed manual optimization raising `AttributeError` with Bagua Strategy ([#12534](https://github.com/PyTorchLightning/pytorch-lightning/issues/12534))
3535
- Fixed the import of `pytorch_lightning` causing a warning 'Redirects are currently not supported in Windows or MacOs' ([#15610](https://github.com/PyTorchLightning/pytorch-lightning/issues/15610))
3636

37+
- Fixed the automatic fallback from `Trainer(strategy="ddp_spawn", ...)` to `Trainer(strategy="ddp", ...)` when on an LSF cluster ([#15103](https://github.com/PyTorchLightning/pytorch-lightning/issues/15103))
38+
3739

3840
## [1.8.0] - 2022-11-01
3941

src/pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -614,7 +614,10 @@ def _check_strategy_and_fallback(self) -> None:
614614
strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag
615615

616616
if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and (
617-
TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect()
617+
TorchElasticEnvironment.detect()
618+
or KubeflowEnvironment.detect()
619+
or SLURMEnvironment.detect()
620+
or LSFEnvironment.detect()
618621
):
619622
strategy_flag = "ddp"
620623
if strategy_flag == "dp" and self._accelerator_flag == "cpu":

tests/tests_lite/test_connector.py

Lines changed: 35 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
from lightning_lite.plugins.environments import (
3333
KubeflowEnvironment,
3434
LightningEnvironment,
35+
LSFEnvironment,
3536
SLURMEnvironment,
3637
TorchElasticEnvironment,
3738
)
@@ -200,24 +201,41 @@ class Strat(DDPStrategy):
200201
assert connector.strategy is strategy
201202

202203

203-
@mock.patch.dict(
204-
os.environ,
205-
{
206-
"SLURM_NTASKS": "2",
207-
"SLURM_NTASKS_PER_NODE": "1",
208-
"SLURM_JOB_NAME": "SOME_NAME",
209-
"SLURM_NODEID": "0",
210-
"LOCAL_RANK": "0",
211-
"SLURM_PROCID": "0",
212-
"SLURM_LOCALID": "0",
213-
},
204+
@pytest.mark.parametrize(
205+
"env_vars,expected_environment",
206+
[
207+
(
208+
{
209+
"SLURM_NTASKS": "2",
210+
"SLURM_NTASKS_PER_NODE": "1",
211+
"SLURM_JOB_NAME": "SOME_NAME",
212+
"SLURM_NODEID": "0",
213+
"LOCAL_RANK": "0",
214+
"SLURM_PROCID": "0",
215+
"SLURM_LOCALID": "0",
216+
},
217+
SLURMEnvironment,
218+
),
219+
(
220+
{
221+
"LSB_JOBID": "1",
222+
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
223+
"JSM_NAMESPACE_LOCAL_RANK": "1",
224+
"JSM_NAMESPACE_SIZE": "20",
225+
"JSM_NAMESPACE_RANK": "1",
226+
},
227+
LSFEnvironment,
228+
),
229+
],
214230
)
215-
@mock.patch("lightning_lite.accelerators.cuda.num_cuda_devices", return_value=0)
216-
def test_dist_backend_accelerator_mapping(*_):
217-
connector = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
218-
assert isinstance(connector.accelerator, CPUAccelerator)
219-
assert isinstance(connector.strategy, DDPStrategy)
220-
assert connector.strategy.local_rank == 0
231+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
232+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
233+
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
234+
with mock.patch.dict(os.environ, env_vars, clear=True):
235+
trainer = _Connector(strategy="ddp_spawn", accelerator="cpu", devices=2)
236+
assert isinstance(trainer.accelerator, CPUAccelerator)
237+
assert isinstance(trainer.strategy, DDPStrategy)
238+
assert isinstance(trainer.strategy.cluster_environment, expected_environment)
221239

222240

223241
@RunIf(mps=False)

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py

Lines changed: 33 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from lightning_lite.plugins.environments import (
2626
KubeflowEnvironment,
2727
LightningEnvironment,
28+
LSFEnvironment,
2829
SLURMEnvironment,
2930
TorchElasticEnvironment,
3031
)
@@ -193,24 +194,41 @@ class Strat(DDPStrategy):
193194
assert trainer._accelerator_connector.strategy is strategy
194195

195196

196-
@mock.patch.dict(
197-
os.environ,
198-
{
199-
"SLURM_NTASKS": "2",
200-
"SLURM_NTASKS_PER_NODE": "1",
201-
"SLURM_JOB_NAME": "SOME_NAME",
202-
"SLURM_NODEID": "0",
203-
"LOCAL_RANK": "0",
204-
"SLURM_PROCID": "0",
205-
"SLURM_LOCALID": "0",
206-
},
197+
@pytest.mark.parametrize(
198+
"env_vars,expected_environment",
199+
[
200+
(
201+
{
202+
"SLURM_NTASKS": "2",
203+
"SLURM_NTASKS_PER_NODE": "1",
204+
"SLURM_JOB_NAME": "SOME_NAME",
205+
"SLURM_NODEID": "0",
206+
"LOCAL_RANK": "0",
207+
"SLURM_PROCID": "0",
208+
"SLURM_LOCALID": "0",
209+
},
210+
SLURMEnvironment,
211+
),
212+
(
213+
{
214+
"LSB_JOBID": "1",
215+
"LSB_DJOB_RANKFILE": "SOME_RANK_FILE",
216+
"JSM_NAMESPACE_LOCAL_RANK": "1",
217+
"JSM_NAMESPACE_SIZE": "20",
218+
"JSM_NAMESPACE_RANK": "1",
219+
},
220+
LSFEnvironment,
221+
),
222+
],
207223
)
208-
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
209-
def test_dist_backend_accelerator_mapping(cuda_count_0):
210-
trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", accelerator="cpu", devices=2)
224+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._read_hosts", return_value=["node0", "node1"])
225+
@mock.patch("lightning_lite.plugins.environments.lsf.LSFEnvironment._get_node_rank", return_value=0)
226+
def test_fallback_from_ddp_spawn_to_ddp_on_cluster(_, __, env_vars, expected_environment):
227+
with mock.patch.dict(os.environ, env_vars, clear=True):
228+
trainer = Trainer(strategy="ddp_spawn", accelerator="cpu", devices=2)
211229
assert isinstance(trainer.accelerator, CPUAccelerator)
212230
assert isinstance(trainer.strategy, DDPStrategy)
213-
assert trainer.strategy.local_rank == 0
231+
assert isinstance(trainer.strategy.cluster_environment, expected_environment)
214232

215233

216234
def test_interactive_incompatible_backend_error(mps_count_2, cuda_count_2, monkeypatch):

0 commit comments

Comments
 (0)