Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support launching Lightning ddp with traditional command #7480

Merged
merged 24 commits into from
Jul 14, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307))


- Enabled traditional/manual launching of DDP processes through `LOCAL_RANK` and `NODE_RANK` environment variable assignments ([#7480](https://github.com/PyTorchLightning/pytorch-lightning/pull/7480))


### Changed


Expand Down
17 changes: 13 additions & 4 deletions pytorch_lightning/plugins/environments/lightning_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@ class LightningEnvironment(ClusterEnvironment):
"""
The default environment used by Lightning for a single node or free cluster (not managed).

The master process must be launched by the user and Lightning will spawn new
worker processes for distributed training, either in a single node or across multiple nodes.
There are two modes the Lightning environment can operate with:

1. The user only launches the main process by :code:`python train.py ...` with no additional environment variables
set. Lightning will spawn new worker processes for distributed training in the current node.
2. The user launches all processes manually or with utilities like :code:`torch.distributed.launch`.
The appropriate environment variables need to be set, and at minimum :code:`LOCAL_RANK`.

If the master address and port are not provided, the default environment will choose them
automatically. It is recommended to use this default environment for single-node distributed
training as it provides the most convenient way to launch the training script.
training as it provides a convenient way to launch the training script.
"""

def __init__(self):
Expand All @@ -38,7 +42,12 @@ def __init__(self):
self._world_size: int = 1

def creates_children(self) -> bool:
return False
"""
Returns whether the cluster creates the processes or not.
If at least :code:`LOCAL_RANK` is available as an environment variable, Lightning assumes the user acts as the
process launcher/job scheduler and Lightning will not launch new processes.
"""
return "LOCAL_RANK" in os.environ

def master_address(self) -> str:
return os.environ.get("MASTER_ADDR", "127.0.0.1")
Expand Down
4 changes: 1 addition & 3 deletions pytorch_lightning/plugins/training_type/ddp.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def _is_single_process_single_device(self) -> bool:

def setup_environment(self) -> None:
# start the other scripts
if not self.cluster_environment.creates_children() and os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1":
if not self.cluster_environment.creates_children():
carmocca marked this conversation as resolved.
Show resolved Hide resolved
self._call_children_scripts()

# set the task idx
Expand Down Expand Up @@ -208,8 +208,6 @@ def _call_children_scripts(self):
if self.parallel_devices is None:
raise MisconfigurationException("you selected (distribute_backend = ddp) but did not set Trainer(gpus=?)")

os.environ["PL_IN_DDP_SUBPROCESS"] = "1"

os.environ["WORLD_SIZE"] = f"{self.num_processes * self.num_nodes}"

self.interactive_ddp_procs = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -452,11 +452,6 @@ def select_training_type_plugin(self) -> TrainingTypePlugin:
use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN
use_ddp_fully_sharded = self._distrib_type == DistributedType.DDP_FULLY_SHARDED

# TODO: decouple from TE
# ddp script mode uses the same flags as TE
if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
use_torchelastic_ddp = False

if use_tpu_spawn:
ddp_plugin_cls = TPUSpawnPlugin
elif use_ddp_sharded:
Expand Down
16 changes: 16 additions & 0 deletions tests/plugins/environments/test_lightning_environment.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from unittest import mock

import pytest

from pytorch_lightning.plugins.environments import LightningEnvironment


Expand Down Expand Up @@ -37,6 +39,20 @@ def test_attributes_from_environment_variables():
assert env.world_size() == 100


@pytest.mark.parametrize(
    "environ, creates_children", [
        ({}, False),
        (dict(LOCAL_RANK="2"), True),
        (dict(NODE_RANK="1"), False),
    ]
)
def test_manual_user_launch(environ, creates_children):
    """Test that the environment switches to manual user mode when LOCAL_RANK env variable detected.

    Args:
        environ: the complete set of environment variables visible to the environment under test.
        creates_children: the value ``LightningEnvironment.creates_children()`` is expected to return.
    """
    # clear=True makes ``environ`` the *entire* environment for the duration of the test. Without it,
    # a LOCAL_RANK inherited from the process running the test suite (e.g. when the suite itself is
    # started by a distributed launcher) would leak in and break the cases that expect False.
    with mock.patch.dict(os.environ, environ, clear=True):
        env = LightningEnvironment()
        assert env.creates_children() == creates_children


@mock.patch.dict(os.environ, {
"GROUP_RANK": "1",
})
Expand Down
6 changes: 6 additions & 0 deletions tests/special_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ if [ $? -eq 0 ]; then
report+="Ran\ttests/utilities/test_warnings.py\n"
fi

# test that a user can manually launch individual processes
Borda marked this conversation as resolved.
Show resolved Hide resolved
# Start one process per rank by hand: rank 1 runs in the background, rank 0 in the foreground.
# LOCAL_RANK being set is what tells Lightning not to spawn the worker processes itself.
args="--trainer.gpus 2 --trainer.accelerator ddp --trainer.fast_dev_run=True"
# ${args} is intentionally unquoted so that it word-splits into separate CLI arguments.
MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python pl_examples/basic_examples/simple_image_classifier.py ${args} &
MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python pl_examples/basic_examples/simple_image_classifier.py ${args}
report+="Ran\tmanual ddp launch test\n"

# echo test report
printf '=%.s' {1..80}
printf "\n$report"
Expand Down