[WIP] ref: decoupled ddp, ddp spawn #3733

Closed
wants to merge 119 commits into from

Commits (changes shown from all 119 commits)
767b8ab
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
f746018
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
c3529ee
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
3497f0d
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
c4a9dc0
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
09bf2a6
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
81d7a0d
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
54a7402
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
2960aa2
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
417242c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b751f3a
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
e40a7c2
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
78bf07b
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
07efc8e
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
1276a51
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
d4b9f37
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
3041561
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
61ab801
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
7eeaa64
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
416a96d
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b4454ee
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
f151c21
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
4278731
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
2e9c537
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
6f6f4fa
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
dab971d
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
95aaca6
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b46874c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
424a6db
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
35d01e4
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
f6e0bbe
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
a0542ae
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
64a486c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
d124a94
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
3fa5ad2
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
2e49563
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
8acddd7
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
50a9c8b
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
5fc4912
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
2070075
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
f0c06bd
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
08b0cad
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
8a8a0bf
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
ed675ef
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
336bb47
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
c3f299a
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
e4cb76d
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
94ef3b9
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
357d640
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
e49c8a1
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
91736e2
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
15e5be0
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b37d948
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
51370ce
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
23032ea
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
9f8705a
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
0f13e61
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
7ccabd8
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
9171464
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
1d4aeaa
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b96d7c1
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
85050a3
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
506b037
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
63f5d50
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
01dd4c5
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
a0f52d7
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
650903a
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
cbd89f7
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
8ebd4ed
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
1f19c2f
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
ea448bb
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
fbeec9e
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
7663c6b
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
9421dbb
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
cf08480
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
f0c3cc5
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
459a0fa
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
64484a1
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
10bae5b
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
667c434
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
5b412e0
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
d9fc538
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
b2e941c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
5ac3e59
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
3650f86
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
da582ab
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
471b576
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
545bf01
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
7b72cd6
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
1fbc1ca
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
c5c9faf
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
701f233
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
4a7368a
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
7169107
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
27e5870
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
455a488
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
6c3732c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
73f0ef3
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
e36e20f
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
2f93660
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
1fb466c
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
202e82e
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
c8bd6ee
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
d4d8551
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
5acef3e
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
288fd23
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
0dcdd81
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
581e929
ref: decoupled ddp spawn
williamFalcon Sep 30, 2020
fe53c9a
h
williamFalcon Oct 1, 2020
c644f66
h
williamFalcon Oct 1, 2020
2a10f59
h
williamFalcon Oct 1, 2020
beacd6a
rebased
williamFalcon Oct 1, 2020
7e98763
merged
williamFalcon Oct 1, 2020
661cfb0
merged
williamFalcon Oct 1, 2020
69235e9
merged
williamFalcon Oct 1, 2020
c958ec7
ref: part 4 of #3733
williamFalcon Oct 1, 2020
6088c48
ref: part 4 of #3733
williamFalcon Oct 1, 2020
f86ab63
ref: part 4 of #3733
williamFalcon Oct 1, 2020
2c2755c
ref: clean up ddp before final fix
williamFalcon Oct 3, 2020
83 changes: 41 additions & 42 deletions pytorch_lightning/accelerators/ddp_backend.py
@@ -12,22 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License
import os
import torch.distributed as dist
import torch
import torch.distributed as torch_distrib
import subprocess
import sys
from os.path import abspath
from time import sleep
from typing import Optional

import numpy as np
import torch
import torch.distributed as torch_distrib
import torch.distributed as dist


from pytorch_lightning import _logger as log
from pytorch_lightning.utilities.distributed import find_free_network_port
from pytorch_lightning.accelerators.base_backend import Accelerator
from pytorch_lightning import _logger as log
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.distributed.dist import LightningDistributed

@@ -47,6 +47,7 @@ def __init__(self, trainer):
super().__init__(trainer)
self.task_idx = None
self._has_spawned_children = False
self.interactive_ddp_procs = []
self.dist = LightningDistributed()

def setup(self, model):
@@ -57,7 +58,6 @@ def setup(self, model):
self._call_children_scripts()

def _call_children_scripts(self):

assert self.trainer.global_rank == 0
self._check_can_spawn_children()
self._has_spawned_children = True
@@ -104,11 +104,12 @@ def _call_children_scripts(self):

os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

self.trainer.interactive_ddp_procs = []
self.interactive_ddp_procs = []
for local_rank in range(1, self.trainer.num_processes):
env_copy = os.environ.copy()
env_copy['LOCAL_RANK'] = f'{local_rank}'
env_copy['PL_DDP_PID'] = str(self.trainer.data_parallel_device_ids[local_rank])
env_copy['PL_GLOBAL_SEED'] = os.environ.get('PL_GLOBAL_SEED', None)

# start process
# if hydra is available and initialized, make sure to set the cwd correctly
@@ -117,7 +118,7 @@ def _call_children_scripts(self):
if HydraConfig.initialized():
cwd = get_original_cwd()
proc = subprocess.Popen(command, env=env_copy, cwd=cwd)
self.trainer.interactive_ddp_procs.append(proc)
self.interactive_ddp_procs.append(proc)

# starting all processes at once can cause issues
# with dataloaders delay between 1-10 seconds
@@ -126,12 +127,36 @@ def _call_children_scripts(self):

self.task_idx = 0

# wait for all the procs to start
sleep(2)
Review comment (Member): Do the processes communicate on startup? I feel like a hardcoded sleep is not the optimal solution here.
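
One possible alternative to the fixed delay, sketched here purely for illustration: each spawned child writes a per-rank marker file once it is up, and the parent polls for those markers with a timeout instead of sleeping. The ready_dir layout and the wait_for_children helper are hypothetical and not part of this PR.

import os
import time

def wait_for_children(ready_dir: str, num_processes: int, timeout: float = 30.0) -> None:
    """Block until every spawned child rank has written a marker file into ready_dir."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        markers = [f for f in os.listdir(ready_dir) if f.startswith("rank_")]
        if len(markers) >= num_processes - 1:  # ranks 1..N-1 are the spawned children
            return
        time.sleep(0.1)
    raise RuntimeError(f"timed out waiting for {num_processes - 1} child processes to start")

Each child would create its marker right after startup, e.g. open(os.path.join(ready_dir, f"rank_{local_rank}"), "w").close(), so the parent only proceeds once every process has actually launched.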


def train(self):
model = self.trainer.model
results = self.ddp_train(process_idx=self.task_idx, model=model, is_master=True)
del os.environ['WORLD_SIZE']
results = self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model, is_master=True)
if 'WORLD_SIZE' in os.environ:
del os.environ['WORLD_SIZE']
return results

def training_step(self, args):
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: str = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()

def _check_can_spawn_children(self):
if self._has_spawned_children:
raise RuntimeError(
@@ -145,17 +170,7 @@ def set_world_ranks(self, process_idx):
self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

def model_to_device(self, model, process_idx, is_master):
gpu_idx = process_idx

# when using ddp, the master process (proc 0) continues running as the main one
# this means that the local rank will always be 0
# (even if cuda visible devices has other visible gpus)
# this means that the master process needs to pull the 0th visible index as the device number
if is_master:
available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
gpu_idx = int(available_gpus[self.trainer.local_rank])

gpu_idx = int(os.environ.get('PL_DDP_PID', gpu_idx))
gpu_idx = int(os.environ.get('PL_DDP_PID', process_idx))

self.trainer.root_gpu = gpu_idx
torch.cuda.set_device(self.trainer.root_gpu)
@@ -165,25 +180,8 @@ def get_device_ids(self):
device_ids = [self.trainer.root_gpu]
return device_ids

def training_step(self, args):
if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
output = self.training_step(args)
return output

def test_step(self, args):
output = self.training_step(args)
return output

def barrier(self, name: str = None):
if torch_distrib.is_initialized():
torch_distrib.barrier()
def on_train_end(self):
pass

def early_stopping_should_stop(self, pl_module):
stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device)
@@ -207,7 +205,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
Returns:

"""
seed = os.environ.get("PL_GLOBAL_SEED")
seed = os.environ.get("PL_GLOBAL_SEED", None)
if seed is not None:
seed_everything(int(seed))

@@ -268,6 +266,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
model = model.configure_ddp(model, device_ids)

# set up training routine
self.barrier('ddp_setup')
self.trainer.train_loop.setup_training(model)

# train or test
103 changes: 100 additions & 3 deletions pytorch_lightning/accelerators/ddp_spawn_backend.py
@@ -15,15 +15,15 @@
import re

import torch
import torch.multiprocessing as mp
import torch.distributed as torch_distrib
import torch.multiprocessing as mp
import torch.distributed as dist

from pytorch_lightning import _logger as log
from pytorch_lightning.accelerators.base_backend import Accelerator
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load
from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.utilities.distributed import find_free_network_port
@@ -157,12 +157,109 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0
# clean up memory
torch.cuda.empty_cache()

def ddp_train(self, process_idx, mp_queue, model):
"""
Entry point for ddp

Args:
process_idx:
mp_queue: multiprocessing queue
model:

Returns:

"""
# show progressbar only on progress_rank 0
if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
self.trainer.progress_bar_callback.disable()

# determine which process we are and world size
self.set_world_ranks(process_idx)

# set warning rank
rank_zero_only.rank = self.trainer.global_rank

# set up server using proc 0's ip address
# try to init for 20 times at max in case ports are taken
# where to store ip_table
model.trainer = self.trainer
model.init_ddp_connection(
self.trainer.global_rank,
self.trainer.world_size,
self.trainer.is_slurm_managing_tasks
)

# call setup after the ddp process has connected
self.trainer.call_setup_hook(model)

# on world_size=0 let everyone know training is starting
if self.trainer.is_global_zero and not torch.distributed.is_initialized():
log.info('-' * 100)
log.info(f'distributed_backend={self.trainer.distributed_backend}')
log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes')
log.info('-' * 100)

# call sync_bn before .cuda(), configure_apex and configure_ddp
if self.trainer.sync_batchnorm:
model = model.configure_sync_batchnorm(model)

# move the model to the correct device
self.model_to_device(model, process_idx)

# CHOOSE OPTIMIZER
# allow for lr schedulers as well
self.setup_optimizers(model)

# set model properties before going into wrapper
self.trainer.model_connector.copy_trainer_model_properties(model)

# 16-bit
model = self.trainer.precision_connector.connect(model)

# device ids change depending on the DDP setup
device_ids = self.get_device_ids()

# allow user to configure ddp
model = model.configure_ddp(model, device_ids)

# set up training routine
self.trainer.train_loop.setup_training(model)

# train or test
results = self.train_or_test()

# get original model
model = self.trainer.get_model()

# persist info in ddp_spawn
self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)

def training_step(self, args):
Suggested change (Member):
- def training_step(self, args):
+ def training_step(self, *args):

if self.trainer.amp_backend == AMPType.NATIVE:
with torch.cuda.amp.autocast():
output = self.trainer.model(*args)
else:
output = self.trainer.model(*args)
return output

def validation_step(self, args):
Suggested change (Member):
- def validation_step(self, args):
+ def validation_step(self, *args):

output = self.training_step(args)
Suggested change (Member):
- output = self.training_step(args)
+ output = self.training_step(*args)

Review comment (Member): Again, can we not call training_step here but rather use some other common function? Overwriting training_step can yield unexpected behaviour here, since I'd not expect training_step to influence validation_step and test_step at all!
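
A minimal sketch of the shared-helper idea raised above, assuming the surrounding accelerator class (self.trainer, plus the torch and AMPType imports already present in this file); the _forward name is hypothetical and not part of this PR. All three step methods delegate to one common wrapper, so overriding training_step no longer changes validation or test behaviour.

def _forward(self, *args):
    # single forward wrapper shared by train/val/test so none of them depends on the others
    if self.trainer.amp_backend == AMPType.NATIVE:
        with torch.cuda.amp.autocast():
            return self.trainer.model(*args)
    return self.trainer.model(*args)

def training_step(self, *args):
    return self._forward(*args)

def validation_step(self, *args):
    return self._forward(*args)

def test_step(self, *args):
    return self._forward(*args)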

return output

def test_step(self, args):
Suggested change (Member):
- def test_step(self, args):
+ def test_step(self, *args):

output = self.training_step(args)
Suggested change (Member):
- output = self.training_step(args)
+ output = self.training_step(*args)

return output

def barrier(self, name: str = None):
if torch_distrib.is_initialized():
Review comment (Member): unused name argument

torch_distrib.barrier()

def set_world_ranks(self, process_idx):
self.trainer.local_rank = process_idx
self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx
self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

def model_to_device(self, model, process_idx, is_master):
def model_to_device(self, model, process_idx):
gpu_idx = process_idx
self.trainer.root_gpu = gpu_idx
torch.cuda.set_device(self.trainer.root_gpu)
36 changes: 18 additions & 18 deletions tests/backends/test_ddp.py
@@ -37,21 +37,21 @@ def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
assert result['status'] == 'complete'


# @pytest.mark.parametrize('cli_args', [
# pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
# ])
# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
# def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
# # call the script
# call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20)
#
# # load the results of the script
# result_path = os.path.join(tmpdir, 'ddp.result')
# result = torch.load(result_path)
#
# # verify the file wrote the expected outputs
# assert result['status'] == 'complete'
#
# model_outs = result['result']
# for out in model_outs:
# assert out['test_acc'] > 0.90
@pytest.mark.parametrize('cli_args', [
pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
])
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
# call the script
call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20)

# load the results of the script
result_path = os.path.join(tmpdir, 'ddp.result')
result = torch.load(result_path)

# verify the file wrote the expected outputs
assert result['status'] == 'complete'

model_outs = result['result']
for out in model_outs:
assert out['test_acc'] > 0.90