Commit d24978b

awaelchli and rohitgr7 authored
Add ddp_notebook alias for ddp_fork (#13744)
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
1 parent ae98031 commit d24978b

5 files changed (+42, -21 lines)

docs/source-pytorch/accelerators/gpu_intermediate.rst

Lines changed: 14 additions & 7 deletions
@@ -24,7 +24,7 @@ Lightning supports multiple ways of doing distributed training.
 - DistributedDataParallel (multiple-gpus across many machines)
     - Regular (``strategy='ddp'``)
     - Spawn (``strategy='ddp_spawn'``)
-    - Fork (``strategy='ddp_fork'``)
+    - Notebook/Fork (``strategy='ddp_notebook'``)
 - Horovod (``strategy='horovod'``) (multi-machine, multi-gpu, configured at runtime)
 - Bagua (``strategy='bagua'``) (multiple-gpus across many machines with advanced training algorithms)
 

@@ -101,7 +101,7 @@ There are cases in which it is NOT possible to use DDP. Examples are:
 - Jupyter Notebook, Google COLAB, Kaggle, etc.
 - You have a nested script without a root package
 
-In these situations you should use `dp` or `ddp_spawn` instead.
+In these situations you should use `ddp_notebook` or `dp` instead.
 
 Distributed Data Parallel 2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -201,18 +201,25 @@ You can then call your scripts anywhere
     python some_file.py --accelerator 'gpu' --devices 8 --strategy 'ddp'
 
 
-Distributed Data Parallel Fork
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Distributed Data Parallel in Notebooks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-DDP Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on:
+DDP Notebook/Fork is an alternative to Spawn that can be used in interactive Python and Jupyter notebooks, Google Colab, Kaggle notebooks, and so on:
+The Trainer enables it by default when such environments are detected.
 
 .. code-block:: python
 
     # train on 8 GPUs in a Jupyter notebook
+    trainer = Trainer(accelerator="gpu", devices=8)
+
+    # can be set explicitly
+    trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_notebook")
+
+    # can also be used in non-interactive environments
     trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_fork")
 
 Data Parallel (``strategy="dp"``) is the only other strategy supported in interactive environments but is slower, is discouraged by PyTorch and has other limitations.
-Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork for its speed and stability but it can only be used with scripts.
+Among the native distributed strategies, regular DDP (``strategy="ddp"``) is still recommended as the go-to strategy over Spawn and Fork/Notebook for its speed and stability but it can only be used with scripts.
 
 
 Comparison of DDP variants and tradeoffs
@@ -225,7 +232,7 @@ Comparison of DDP variants and tradeoffs
    * -
      - DDP
      - DDP Spawn
-     - DDP Fork
+     - DDP Notebook/Fork
    * - Works in Jupyter notebooks / IPython environments
      - No
      - No
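
The updated section above states that the Trainer enables the notebook strategy by default when an interactive environment is detected, and that ``ddp_notebook`` can also be requested explicitly. A minimal sketch (not part of this diff; it assumes a GPU machine, relies on ``trainer.strategy`` exposing the resolved strategy object, and reads the private ``_start_method`` attribute, which may differ between versions) for checking what the alias resolves to:

.. code-block:: python

    from pytorch_lightning import Trainer

    # Request the new alias explicitly; in a notebook the same strategy would be
    # picked automatically when only accelerator/devices are given.
    trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp_notebook")

    # The alias resolves to the fork-based spawn strategy.
    print(type(trainer.strategy).__name__)                    # e.g. "DDPSpawnStrategy"
    print(getattr(trainer.strategy, "_start_method", "n/a"))  # expected: "fork"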

src/pytorch_lightning/strategies/ddp_spawn.py

Lines changed: 18 additions & 9 deletions
@@ -53,6 +53,13 @@
 log = logging.getLogger(__name__)
 
 
+_DDP_FORK_ALIASES = (
+    "ddp_fork",
+    "ddp_fork_find_unused_parameters_false",
+    "ddp_notebook",
+    "ddp_notebook_find_unused_parameters_false",
+)
+
 
 class DDPSpawnStrategy(ParallelStrategy):
     """Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training
@@ -283,20 +290,22 @@ def post_training_step(self):
 
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
-        for start_method in ("spawn", "fork"):
+        entries = (
+            ("ddp_spawn", "spawn"),
+            ("ddp_spawn_find_unused_parameters_false", "spawn"),
+            ("ddp_fork", "fork"),
+            ("ddp_fork_find_unused_parameters_false", "fork"),
+            ("ddp_notebook", "fork"),
+            ("ddp_notebook_find_unused_parameters_false", "fork"),
+        )
+        for name, start_method in entries:
             strategy_registry.register(
-                f"ddp_{start_method}_find_unused_parameters_false",
+                name,
                 cls,
-                description=f"DDP {start_method.title()} strategy with `find_unused_parameters` as False",
+                description=f"DDP strategy with `find_unused_parameters` as False and `start_method` '{start_method}'",
                 find_unused_parameters=False,
                 start_method=start_method,
             )
-            strategy_registry.register(
-                f"ddp_{start_method}",
-                cls,
-                description=f"DDP {start_method.title()} strategy",
-                start_method=start_method,
-            )
 
     def teardown(self) -> None:
         log.detail(f"{self.__class__.__name__}: tearing down strategy")

src/pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 2 additions & 4 deletions
@@ -72,6 +72,7 @@
     StrategyRegistry,
     TPUSpawnStrategy,
 )
+from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
 from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
 from pytorch_lightning.utilities import (
     _StrategyType,
@@ -617,10 +618,7 @@ def _check_strategy_and_fallback(self) -> None:
                 f"You selected strategy to be `{DDPFullyShardedNativeStrategy.strategy_name}`, "
                 "but GPU accelerator is not used."
             )
-        if (
-            strategy_flag in ("ddp_fork", "ddp_fork_find_unused_parameters_false")
-            and "fork" not in torch.multiprocessing.get_all_start_methods()
-        ):
+        if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods():
             raise ValueError(
                 f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
                 f" platform. We recommed `Trainer(strategy='ddp_spawn')` instead."

tests/tests_pytorch/accelerators/test_accelerator_connector.py

Lines changed: 2 additions & 1 deletion
@@ -44,6 +44,7 @@
     DeepSpeedStrategy,
     SingleDeviceStrategy,
 )
+from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
 from pytorch_lightning.strategies.hpu_parallel import HPUParallelStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests_pytorch.helpers.runif import RunIf
@@ -761,7 +762,7 @@ def test_accelerator_specific_checkpoint_io(*_):
     assert trainer.strategy.checkpoint_io is ckpt_plugin
 
 
-@pytest.mark.parametrize("strategy", ["ddp_fork", "ddp_fork_find_unused_parameters_false"])
+@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
 @mock.patch(
     "pytorch_lightning.trainer.connectors.accelerator_connector.torch.multiprocessing.get_all_start_methods",
     return_value=[],
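
The parametrization now iterates over all four aliases via ``_DDP_FORK_ALIASES``; the rest of the test body is cut off above. As a purely hypothetical sketch (the test name and assertion are assumptions, not the repository's actual test), the patched ``get_all_start_methods`` returning an empty list should make every alias hit the new ValueError:

.. code-block:: python

    from unittest import mock

    import pytest

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES


    @pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
    @mock.patch(
        "pytorch_lightning.trainer.connectors.accelerator_connector.torch.multiprocessing.get_all_start_methods",
        return_value=[],
    )
    def test_fork_aliases_rejected_without_fork(_, strategy):
        # With no start methods reported, every fork alias should fail fast with the
        # ValueError raised in _check_strategy_and_fallback.
        with pytest.raises(ValueError, match="process forking is not supported"):
            Trainer(strategy=strategy)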

tests/tests_pytorch/strategies/test_strategy_registry.py

Lines changed: 6 additions & 0 deletions
@@ -124,6 +124,12 @@ def test_fsdp_strategy_registry(tmpdir):
             {"find_unused_parameters": False, "start_method": "fork"},
             marks=RunIf(skip_windows=True),
         ),
+        pytest.param(
+            "ddp_notebook_find_unused_parameters_false",
+            DDPSpawnStrategy,
+            {"find_unused_parameters": False, "start_method": "fork"},
+            marks=RunIf(skip_windows=True),
+        ),
         (
             "ddp_sharded_spawn_find_unused_parameters_false",
             DDPSpawnShardedStrategy,
