From aeb2f27dc340140e80eb1aee333f331b71fed967 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Tue, 14 Jul 2020 16:08:36 +0000
Subject: [PATCH 1/2] Fixes #1199

- Updated code to propagate spawn kwargs
- start_method is fork by default
---
 ignite/distributed/comp_models/xla.py     |  4 +---
 ignite/distributed/launcher.py            |  8 ++++++--
 tests/ignite/distributed/test_launcher.py | 18 ++++++++++++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/ignite/distributed/comp_models/xla.py b/ignite/distributed/comp_models/xla.py
index fdd4be80d181..e058831e8d84 100644
--- a/ignite/distributed/comp_models/xla.py
+++ b/ignite/distributed/comp_models/xla.py
@@ -113,9 +113,7 @@ def spawn(
         backend: str = XLA_TPU,
         **kwargs
     ):
-        import os
-
-        if "COLAB_TPU_ADDR" in os.environ:
+        if "start_method" not in kwargs:
             kwargs["start_method"] = "fork"
 
         xmp.spawn(
diff --git a/ignite/distributed/launcher.py b/ignite/distributed/launcher.py
index 414bb27bb85f..a5713988d490 100644
--- a/ignite/distributed/launcher.py
+++ b/ignite/distributed/launcher.py
@@ -150,6 +150,7 @@ def training(local_rank, config, **kwargs):
             (`nccl`, `gloo`). Mandatory argument if ``nnodes`` is specified and larger than one.
         master_port (int, optional): optional argument, master node port for torch native backends
             (`nccl`, `gloo`). Mandatory argument if ``master_addr`` is specified.
+        **spawn_kwargs: kwargs to ``idist.spawn`` function.
     """
 
     def __init__(
@@ -160,6 +161,7 @@ def __init__(
         node_rank: Optional[int] = None,
         master_addr: Optional[str] = None,
         master_port: Optional[str] = None,
+        **spawn_kwargs,
     ):
         if backend is not None:
             if backend not in idist.available_backends():
@@ -183,7 +185,7 @@ def __init__(
         if self.backend is not None:
            if nproc_per_node is not None:
                self._spawn_params = self._setup_spawn_params(
-                    nproc_per_node, nnodes, node_rank, master_addr, master_port
+                    nproc_per_node, nnodes, node_rank, master_addr, master_port, **spawn_kwargs
                )
 
        if self._spawn_params is not None:
@@ -191,7 +193,8 @@ def __init__(
             msg = "\n\t".join(["{}: {}".format(k, v) for k, v in self._spawn_params.items() if v is not None])
             self.logger.info("- Parameters to spawn processes: \n\t{}".format(msg))
 
-    def _setup_spawn_params(self, nproc_per_node, nnodes, node_rank, master_addr, master_port):
+    @staticmethod
+    def _setup_spawn_params(nproc_per_node, nnodes, node_rank, master_addr, master_port, **spawn_kwargs):
         if nproc_per_node < 1:
             raise ValueError("Argument nproc_per_node should positive, but given {}".format(nproc_per_node))
         if nnodes is None:
@@ -218,6 +221,7 @@ def _setup_spawn_params(self, nproc_per_node, nnodes, node_rank, master_addr, ma
             "master_addr": master_addr,
             "master_port": master_port,
         }
+        params.update(spawn_kwargs)
         return {k: v for k, v in params.items() if v is not None}
 
     def run(self, func: Callable, *args, **kwargs):
diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py
index b8ccadfbca3c..6b64b2639d9c 100644
--- a/tests/ignite/distributed/test_launcher.py
+++ b/tests/ignite/distributed/test_launcher.py
@@ -158,3 +158,21 @@ def test_idist_parallel_no_dist():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     with idist.Parallel(backend=None) as parallel:
         parallel.run(_test_func, ws=1, device=device)
+
+
+@pytest.mark.tpu
+@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
+@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
+def test_idist_parallel_spawn_params():
+
+    res = idist.Parallel._setup_spawn_params(
+        nproc_per_node=8, nnodes=None, node_rank=None, master_addr=None, master_port=None, start_method="fork"
+    )
+    assert "nproc_per_node" in res and res["nproc_per_node"] == 8
+    assert "start_method" in res and res["start_method"] == "fork"
+
+    with idist.Parallel(backend="xla-tpu", nproc_per_node=8, start_method="fork") as parallel:
+        assert parallel.backend == "xla-tpu"
+        res = parallel._spawn_params
+        assert "nproc_per_node" in res and res["nproc_per_node"] == 8
+        assert "start_method" in res and res["start_method"] == "fork"

From 813642f23e5f10fe0a20bf4a6e108b827198414f Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Tue, 14 Jul 2020 16:44:25 +0000
Subject: [PATCH 2/2] Fixed bad syntax

---
 ignite/distributed/launcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ignite/distributed/launcher.py b/ignite/distributed/launcher.py
index a5713988d490..2823263ae1c6 100644
--- a/ignite/distributed/launcher.py
+++ b/ignite/distributed/launcher.py
@@ -161,7 +161,7 @@ def __init__(
         node_rank: Optional[int] = None,
         master_addr: Optional[str] = None,
         master_port: Optional[str] = None,
-        **spawn_kwargs,
+        **spawn_kwargs
     ):
         if backend is not None:
             if backend not in idist.available_backends():
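
Usage note (not part of the patch itself): a minimal sketch of what the propagated **spawn_kwargs enable, assuming these commits are applied and a TPU runtime with PyTorch XLA is available. The extra start_method="fork" keyword is simply forwarded from Parallel to idist.spawn.

# Sketch only: assumes this patch series is applied and an xla-tpu backend is usable.
import ignite.distributed as idist


def training(local_rank, config):
    # Runs in every spawned process; local_rank is injected by idist.
    print(idist.get_rank(), ": run with config:", config)


if __name__ == "__main__":
    config = {"max_epochs": 2}
    # Extra keyword arguments such as start_method are now forwarded to
    # idist.spawn via Parallel's **spawn_kwargs ("fork" is also the default
    # start method for the xla-tpu backend after this change).
    with idist.Parallel(backend="xla-tpu", nproc_per_node=8, start_method="fork") as parallel:
        parallel.run(training, config)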