From 5990e6241eeba8805bf4476e20aa06d71a2e8582 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Tue, 8 Sep 2020 10:32:11 +0000
Subject: [PATCH] Fixes #1258 - Replaced mp.spawn by mp.start_processes for native comp model

---
 ignite/distributed/comp_models/native.py            | 2 +-
 ignite/distributed/utils.py                         | 6 +++---
 tests/ignite/distributed/comp_models/test_native.py | 4 +++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py
index 74c1b2abae1c..75d6a946d9a1 100644
--- a/ignite/distributed/comp_models/native.py
+++ b/ignite/distributed/comp_models/native.py
@@ -279,7 +279,7 @@ def spawn(
         if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
             spawn_kwargs["start_method"] = kwargs.get("start_method", "spawn")
 
-        mp.spawn(
+        mp.start_processes(
             _NativeDistModel._dist_worker_task_fn,
             nprocs=nproc_per_node,
             args=(
diff --git a/ignite/distributed/utils.py b/ignite/distributed/utils.py
index 56d07afa036a..bde3fdbc658e 100644
--- a/ignite/distributed/utils.py
+++ b/ignite/distributed/utils.py
@@ -285,7 +285,7 @@ def train_fn(local_rank, a, b, c, d=12):
 
             - | "nccl" or "gloo" : `nnodes` (default, 1), `node_rank` (default, 0), `master_addr`
               | (default, "127.0.0.1"), `master_port` (default, 2222), `timeout` to `dist.init_process_group`_ function
-              | and kwargs for `mp.spawn`_ function.
+              | and kwargs for `mp.start_processes`_ function.
 
             - | "xla-tpu" : `nnodes` (default, 1), `node_rank` (default, 0) and kwargs to `xmp.spawn`_ function.
 
@@ -293,8 +293,8 @@ def train_fn(local_rank, a, b, c, d=12):
              | and `node_rank=0` are tolerated and ignored, otherwise an exception is raised.
 
     .. _dist.init_process_group: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
-    .. _mp.spawn: https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn
-    .. _xmp.spawn: http://pytorch.org/xla/release/1.5/index.html#torch_xla.distributed.xla_multiprocessing.spawn
+    .. _mp.start_processes: https://pytorch.org/docs/stable/_modules/torch/multiprocessing/spawn.html#spawn
+    .. _xmp.spawn: http://pytorch.org/xla/release/1.6/index.html#torch_xla.distributed.xla_multiprocessing.spawn
     .. _hvd_run: https://horovod.readthedocs.io/en/latest/api.html#module-horovod.run
 
     """
diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py
index 550969a47972..b509df82b182 100644
--- a/tests/ignite/distributed/comp_models/test_native.py
+++ b/tests/ignite/distributed/comp_models/test_native.py
@@ -276,13 +276,14 @@ def _test_dist_spawn_fn(local_rank, backend, world_size, device):
     assert _model.device() == torch.device(device)
 
 
-def _test__native_dist_model_spawn(backend, num_workers_per_machine, device):
+def _test__native_dist_model_spawn(backend, num_workers_per_machine, device, **spawn_kwargs):
     _NativeDistModel.spawn(
         _test_dist_spawn_fn,
         args=(backend, num_workers_per_machine, device),
         kwargs_dict={},
         backend=backend,
         nproc_per_node=num_workers_per_machine,
+        **spawn_kwargs,
     )
 
 
@@ -290,6 +291,7 @@ def _test__native_dist_model_spawn(backend, num_workers_per_machine, device):
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test__native_dist_model_spawn_gloo():
     _test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu")
+    _test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu", start_method="fork")
 
 
 @pytest.mark.distributed
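
Note (illustration, not part of the patch): the motivation for the change is that torch.multiprocessing.spawn is hard-wired to the "spawn" start method, while torch.multiprocessing.start_processes takes a start_method argument (e.g. "fork"), which the test above exercises via start_method="fork". A minimal standalone sketch of that difference, assuming a PyTorch version (>= 1.5) where start_processes is available; the worker function and message are hypothetical:

    # Minimal sketch of mp.start_processes with a non-default start method.
    import torch.multiprocessing as mp


    def _worker(local_rank, message):
        # Each child receives its process index as the first positional argument,
        # the same calling convention used by _NativeDistModel._dist_worker_task_fn.
        print(f"worker {local_rank}: {message}")


    if __name__ == "__main__":
        # mp.spawn() would ignore/warn on a non-"spawn" start method;
        # start_processes() honours start_method="fork".
        mp.start_processes(_worker, args=("hello",), nprocs=2, start_method="fork")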