diff --git a/.circleci/config.yml b/.circleci/config.yml index 450d1e69ef34..89710d759800 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -187,7 +187,7 @@ jobs: export example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --num_procs_per_node=2" + export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" diff --git a/examples/contrib/cifar10/README.md b/examples/contrib/cifar10/README.md index 394575e693ce..dcb0b7769b53 100644 --- a/examples/contrib/cifar10/README.md +++ b/examples/contrib/cifar10/README.md @@ -50,7 +50,7 @@ python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run - or ```bash # using function spawn inside the code -python -u main.py run --backend="nccl" --num_procs_per_node=2 +python -u main.py run --backend="nccl" --nproc_per_node=2 ``` If user would like to provide already downloaded dataset, the path can be setup in parameters as diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py index ae2d140bca2e..86e994c14af7 100644 --- a/examples/contrib/cifar10/main.py +++ b/examples/contrib/cifar10/main.py @@ -133,7 +133,7 @@ def run( backend=None, resume_from=None, log_every_iters=15, - num_procs_per_node=None, + nproc_per_node=None, stop_iteration=None, with_trains=False, **spawn_kwargs @@ -156,14 +156,14 @@ def run( checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200. backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu", "gloo" etc. Default, None. - num_procs_per_node (int, optional): optional argument to setup number of processes per node. It is useful, + nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful, when main python process is spawning training as child processes. resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None. log_every_iters (int): argument to log progress every ``log_every_iters`` iterations. It can be 0 to disable it. Default, 15. stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint. with_trains (bool): if True, experiment Trains logger is setup. Default, False. 
- **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, num_nodes + **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes """ # catch all local parameters @@ -171,7 +171,7 @@ def run( config.update(config["spawn_kwargs"]) del config["spawn_kwargs"] - spawn_kwargs["num_procs_per_node"] = num_procs_per_node + spawn_kwargs["nproc_per_node"] = nproc_per_node with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py index f4fd5f608555..83597ba185b8 100644 --- a/ignite/distributed/auto.py +++ b/ignite/distributed/auto.py @@ -68,7 +68,7 @@ def auto_dataloader(dataset, **kwargs): kwargs["batch_size"] //= world_size if "num_workers" in kwargs: - nproc = idist.get_ntasks_per_node() + nproc = idist.get_nproc_per_node() kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc if "batch_sampler" not in kwargs: diff --git a/ignite/distributed/comp_models/base.py b/ignite/distributed/comp_models/base.py index 7d73e9600f9e..87943dd4c9cc 100644 --- a/ignite/distributed/comp_models/base.py +++ b/ignite/distributed/comp_models/base.py @@ -17,20 +17,20 @@ class ComputationModel(metaclass=ABCMeta): def __init__(self): self._backend = None - self._ntasks_per_node = None + self._nproc_per_node = None self._nnodes = None self._node = None def _setup_attrs(self): - if self._ntasks_per_node is None: - self._ntasks_per_node = self._compute_ntasks_per_node() if self.get_world_size() > 1 else 1 + if self._nproc_per_node is None: + self._nproc_per_node = self._compute_nproc_per_node() if self.get_world_size() > 1 else 1 if self._nnodes is None: - self._nnodes = self.get_world_size() // self._ntasks_per_node + self._nnodes = self.get_world_size() // self._nproc_per_node if self._node is None: - self._node = self.get_rank() // self._ntasks_per_node + self._node = self.get_rank() // self._nproc_per_node @abstractmethod - def _compute_ntasks_per_node(self) -> int: + def _compute_nproc_per_node(self) -> int: pass @abstractmethod @@ -46,11 +46,11 @@ def get_world_size(self) -> int: pass @abstractmethod - def get_ntasks_per_node(self) -> int: + def get_nproc_per_node(self) -> int: pass @abstractmethod - def get_num_nodes(self) -> int: + def get_nnodes(self) -> int: pass @abstractmethod @@ -181,10 +181,10 @@ def get_rank(self) -> int: def get_world_size(self) -> int: return 1 - def get_ntasks_per_node(self) -> int: + def get_nproc_per_node(self) -> int: return 1 - def get_num_nodes(self) -> int: + def get_nnodes(self) -> int: return 1 def get_node_rank(self) -> int: @@ -201,7 +201,7 @@ def backend(self) -> None: def finalize(self): pass - def _compute_ntasks_per_node(self) -> int: + def _compute_nproc_per_node(self) -> int: return 1 @staticmethod diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py index 725946334ca0..41c70f778847 100644 --- a/ignite/distributed/comp_models/native.py +++ b/ignite/distributed/comp_models/native.py @@ -92,7 +92,7 @@ def _init_from_context(self): self._master_addr = None self._setup_attrs() - def _compute_ntasks_per_node(self): + def _compute_nproc_per_node(self): tensor = torch.tensor([self.get_local_rank() + 1]).to(self.device()) dist.all_reduce(tensor, op=dist.ReduceOp.MAX) return tensor.item() @@ -202,10 +202,10 @@ def get_rank(self) -> int: def get_world_size(self) -> int: return dist.get_world_size() - def get_ntasks_per_node(self) -> int: - return 
self._ntasks_per_node + def get_nproc_per_node(self) -> int: + return self._nproc_per_node - def get_num_nodes(self) -> int: + def get_nnodes(self) -> int: return self._nnodes def get_node_rank(self) -> int: @@ -249,15 +249,15 @@ def spawn( fn: Callable, args: Tuple, kwargs_dict: Optional[Mapping] = None, - num_procs_per_node: int = 1, - num_nodes: int = 1, + nproc_per_node: int = 1, + nnodes: int = 1, node_rank: int = 0, master_addr: str = "127.0.0.1", master_port: int = 2222, backend: str = "nccl", **kwargs ): - world_size = num_nodes * num_procs_per_node + world_size = nnodes * nproc_per_node spawn_kwargs = { "join": kwargs.get("join", True), @@ -269,14 +269,14 @@ def spawn( mp.spawn( _NativeDistModel._dist_worker_task_fn, - nprocs=num_procs_per_node, + nprocs=nproc_per_node, args=( backend, fn, args, kwargs_dict, world_size, - num_procs_per_node, + nproc_per_node, node_rank, master_addr, master_port, diff --git a/ignite/distributed/comp_models/xla.py b/ignite/distributed/comp_models/xla.py index 88de595bcbf6..ac8922adb417 100644 --- a/ignite/distributed/comp_models/xla.py +++ b/ignite/distributed/comp_models/xla.py @@ -60,7 +60,7 @@ def _init_from_context(self): self._backend = "xla-tpu" self._setup_attrs() - def _compute_ntasks_per_node(self): + def _compute_nproc_per_node(self): tensor = torch.tensor([self.get_local_rank() + 1.0], dtype=torch.float).to(self.device()) xm.all_reduce("max", [tensor,]) return int(tensor.item()) @@ -74,10 +74,10 @@ def get_rank(self) -> int: def get_world_size(self) -> int: return xm.xrt_world_size() - def get_ntasks_per_node(self) -> int: - return self._ntasks_per_node + def get_nproc_per_node(self) -> int: + return self._nproc_per_node - def get_num_nodes(self) -> int: + def get_nnodes(self) -> int: return self._nnodes def get_node_rank(self) -> int: @@ -107,8 +107,8 @@ def spawn( fn: Callable, args: Tuple, kwargs_dict: Optional[Mapping] = None, - num_procs_per_node: int = 1, - num_nodes: int = 1, + nproc_per_node: int = 1, + nnodes: int = 1, node_rank: int = 0, backend: str = "xla-tpu", **kwargs @@ -121,7 +121,7 @@ def spawn( xmp.spawn( _XlaDistModel._dist_worker_task_fn, args=(backend, fn, args, kwargs_dict), - nprocs=num_procs_per_node, + nprocs=nproc_per_node, **kwargs, ) diff --git a/ignite/distributed/launcher.py b/ignite/distributed/launcher.py index 36ac377325d8..414bb27bb85f 100644 --- a/ignite/distributed/launcher.py +++ b/ignite/distributed/launcher.py @@ -15,7 +15,7 @@ class Parallel: - XLA on TPUs via `pytorch/xla `_ - Namely, it can 1) spawn ``num_procs_per_node`` child processes and initialize a processing group according to + Namely, it can 1) spawn ``nproc_per_node`` child processes and initialize a processing group according to provided ``backend`` (useful for standalone scripts) or 2) only initialize a processing group given the ``backend`` (useful with tools like `torch.distributed.launch`_). @@ -76,7 +76,7 @@ def training(local_rank, config, **kwargs): print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend()) # ... - with idist.Parallel(backend="nccl", num_procs_per_node=4) as parallel: + with idist.Parallel(backend="nccl", nproc_per_node=4) as parallel: parallel.run(training, config, a=1, b=2) @@ -97,7 +97,7 @@ def training(local_rank, config, **kwargs): print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend()) # ... 
- with idist.Parallel(backend="xla-tpu", num_procs_per_node=8) as parallel: + with idist.Parallel(backend="xla-tpu", nproc_per_node=8) as parallel: parallel.run(training, config, a=1, b=2) @@ -124,8 +124,8 @@ def training(local_rank, config, **kwargs): # ... dist_config = { - "num_procs_per_node": 8, - "num_nodes": 2, + "nproc_per_node": 8, + "nnodes": 2, "node_rank": args.node_rank, "master_addr": "master", "master_port": 15000 @@ -138,16 +138,16 @@ def training(local_rank, config, **kwargs): Args: backend (str, optional): backend to use: `nccl`, `gloo`, `xla-tpu`. If None, no distributed configuration. - num_procs_per_node (int, optional): optional argument, number of processes per - node to specify. If not None, :meth:`~ignite.distributed.Parallel.run` will spawn ``num_procs_per_node`` + nproc_per_node (int, optional): optional argument, number of processes per + node to specify. If not None, :meth:`~ignite.distributed.Parallel.run` will spawn ``nproc_per_node`` processes that run input function with its arguments. - num_nodes (int, optional): optional argument, number of nodes participating in distributed configuration. - If not None, :meth:`~ignite.distributed.Parallel.run` will spawn ``num_procs_per_node`` - processes that run input function with its arguments. Total world size is `num_procs_per_node * num_nodes`. - node_rank (int, optional): optional argument, current machine index. Mandatory argument if ``num_nodes`` is + nnodes (int, optional): optional argument, number of nodes participating in distributed configuration. + If not None, :meth:`~ignite.distributed.Parallel.run` will spawn ``nproc_per_node`` + processes that run input function with its arguments. Total world size is `nproc_per_node * nnodes`. + node_rank (int, optional): optional argument, current machine index. Mandatory argument if ``nnodes`` is specified and larger than one. master_addr (str, optional): optional argument, master node TCP/IP address for torch native backends - (`nccl`, `gloo`). Mandatory argument if ``num_nodes`` is specified and larger than one. + (`nccl`, `gloo`). Mandatory argument if ``nnodes`` is specified and larger than one. master_port (int, optional): optional argument, master node port for torch native backends (`nccl`, `gloo`). Mandatory argument if ``master_addr`` is specified. """ @@ -155,8 +155,8 @@ def training(local_rank, config, **kwargs): def __init__( self, backend: str = None, - num_procs_per_node: Optional[int] = None, - num_nodes: Optional[int] = None, + nproc_per_node: Optional[int] = None, + nnodes: Optional[int] = None, node_rank: Optional[int] = None, master_addr: Optional[str] = None, master_port: Optional[str] = None, @@ -167,8 +167,8 @@ def __init__( "Unknown backend '{}'. Available backends: {}".format(backend, idist.available_backends()) ) else: - arg_names = ["num_procs_per_node", "num_nodes", "node_rank", "master_addr", "master_port"] - arg_values = [num_procs_per_node, num_nodes, node_rank, master_addr, master_port] + arg_names = ["nproc_per_node", "nnodes", "node_rank", "master_addr", "master_port"] + arg_values = [nproc_per_node, nnodes, node_rank, master_addr, master_port] for name, value in zip(arg_names, arg_values): if value is not None: raise ValueError( @@ -181,9 +181,9 @@ def __init__( # distributed_rank=0 <=> explicit rank 0, avoid call idist. 
Critical for TPU on Colab, avoid context is setup if self.backend is not None: - if num_procs_per_node is not None: + if nproc_per_node is not None: self._spawn_params = self._setup_spawn_params( - num_procs_per_node, num_nodes, node_rank, master_addr, master_port + nproc_per_node, nnodes, node_rank, master_addr, master_port ) if self._spawn_params is not None: @@ -191,29 +191,29 @@ def __init__( msg = "\n\t".join(["{}: {}".format(k, v) for k, v in self._spawn_params.items() if v is not None]) self.logger.info("- Parameters to spawn processes: \n\t{}".format(msg)) - def _setup_spawn_params(self, num_procs_per_node, num_nodes, node_rank, master_addr, master_port): - if num_procs_per_node < 1: - raise ValueError("Argument num_procs_per_node should positive, but given {}".format(num_procs_per_node)) - if num_nodes is None: - num_nodes = 1 - if num_nodes < 1: - raise ValueError("Argument num_nodes should positive, but given {}".format(num_nodes)) + def _setup_spawn_params(self, nproc_per_node, nnodes, node_rank, master_addr, master_port): + if nproc_per_node < 1: + raise ValueError("Argument nproc_per_node should positive, but given {}".format(nproc_per_node)) + if nnodes is None: + nnodes = 1 + if nnodes < 1: + raise ValueError("Argument nnodes should positive, but given {}".format(nnodes)) if node_rank is None: - if num_nodes > 1: + if nnodes > 1: raise ValueError("If number of nodes larger than one, arguments node_rank should be given") node_rank = 0 - if node_rank >= num_nodes or node_rank < 0: + if node_rank >= nnodes or node_rank < 0: raise ValueError( - "Argument node_rank should be between 0 and {}, but given {}".format(num_nodes - 1, node_rank) + "Argument node_rank should be between 0 and {}, but given {}".format(nnodes - 1, node_rank) ) - if num_nodes > 1 and (master_addr is None or master_port is None): + if nnodes > 1 and (master_addr is None or master_port is None): raise ValueError( "If number of nodes larger than one, arguments master_addr and master_port " "should be specified, but given master_addr={} and master_port={}".format(master_addr, master_port) ) params = { - "num_procs_per_node": num_procs_per_node, - "num_nodes": num_nodes, + "nproc_per_node": nproc_per_node, + "nnodes": nnodes, "node_rank": node_rank, "master_addr": master_addr, "master_port": master_port, @@ -240,9 +240,7 @@ def training(local_rank, config, **kwargs): """ if self._spawn_params is not None: - self.logger.info( - "Spawn function '{}' in {} processes".format(func, self._spawn_params["num_procs_per_node"]) - ) + self.logger.info("Spawn function '{}' in {} processes".format(func, self._spawn_params["nproc_per_node"])) idist.spawn(self.backend, func, args=args, kwargs_dict=kwargs, **self._spawn_params) else: self.logger.info("- Run '{}' in {} processes".format(func, idist.get_world_size())) diff --git a/ignite/distributed/utils.py b/ignite/distributed/utils.py index eecd6eba877d..bf8dba0185e1 100644 --- a/ignite/distributed/utils.py +++ b/ignite/distributed/utils.py @@ -22,9 +22,9 @@ "get_world_size", "get_rank", "get_local_rank", - "get_ntasks_per_node", + "get_nproc_per_node", "get_node_rank", - "get_num_nodes", + "get_nnodes", "spawn", "initialize", "finalize", @@ -144,19 +144,19 @@ def get_local_rank() -> int: @_sync_model_wrapper -def get_ntasks_per_node() -> int: +def get_nproc_per_node() -> int: """Returns number of processes (or tasks) per node within current distributed configuration. Returns 1 if no distributed configuration. 
""" - return _model.get_ntasks_per_node() + return _model.get_nproc_per_node() @_sync_model_wrapper -def get_num_nodes() -> int: +def get_nnodes() -> int: """Returns number of nodes within current distributed configuration. Returns 1 if no distributed configuration. """ - return _model.get_num_nodes() + return _model.get_nnodes() @_sync_model_wrapper @@ -174,14 +174,9 @@ def hostname() -> str: def spawn( - backend: str, - fn: Callable, - args: Tuple, - kwargs_dict: Optional[Mapping] = None, - num_procs_per_node: int = 1, - **kwargs + backend: str, fn: Callable, args: Tuple, kwargs_dict: Optional[Mapping] = None, nproc_per_node: int = 1, **kwargs ): - """Spawns ``num_procs_per_node`` processes that run ``fn`` with ``args``/``kwargs_dict`` and initialize + """Spawns ``nproc_per_node`` processes that run ``fn`` with ``args``/``kwargs_dict`` and initialize distributed configuration defined by ``backend``. Examples: @@ -205,27 +200,27 @@ def train_fn(local_rank, a, b, c, d=12): assert device == torch.device("cuda:{}".format(local_rank)) - idist.spawn("nccl", train_fn, args=(a, b, c), kwargs_dict={"d": 23}, num_procs_per_node=4) + idist.spawn("nccl", train_fn, args=(a, b, c), kwargs_dict={"d": 23}, nproc_per_node=4) 2) Launch multi-node multi-GPU training .. code-block:: python - # >>> (node 0): python main.py --node_rank=0 --num_nodes=8 --master_addr=master --master_port=2222 - # >>> (node 1): python main.py --node_rank=1 --num_nodes=8 --master_addr=master --master_port=2222 + # >>> (node 0): python main.py --node_rank=0 --nnodes=8 --master_addr=master --master_port=2222 + # >>> (node 1): python main.py --node_rank=1 --nnodes=8 --master_addr=master --master_port=2222 # >>> ... - # >>> (node 7): python main.py --node_rank=7 --num_nodes=8 --master_addr=master --master_port=2222 + # >>> (node 7): python main.py --node_rank=7 --nnodes=8 --master_addr=master --master_port=2222 # main.py import torch import ignite.distributed as idist - def train_fn(local_rank, num_nodes, num_procs_per_node): + def train_fn(local_rank, nnodes, nproc_per_node): import torch.distributed as dist assert dist.is_available() and dist.is_initialized() - assert dist.get_world_size() == num_nodes * num_procs_per_node + assert dist.get_world_size() == nnodes * nproc_per_node device = idist.device() assert device == torch.device("cuda:{}".format(local_rank)) @@ -233,9 +228,9 @@ def train_fn(local_rank, num_nodes, num_procs_per_node): idist.spawn( "nccl", train_fn, - args=(num_nodes, num_procs_per_node), - num_procs_per_node=num_procs_per_node, - num_nodes=num_nodes, + args=(nnodes, nproc_per_node), + nproc_per_node=nproc_per_node, + nnodes=nnodes, node_rank=node_rank, master_addr=master_addr, master_port=master_port @@ -259,7 +254,7 @@ def train_fn(local_rank, a, b, c, d=12): assert "xla" in device.type - idist.spawn("xla-tpu", train_fn, args=(a, b, c), kwargs_dict={"d": 23}, num_procs_per_node=8) + idist.spawn("xla-tpu", train_fn, args=(a, b, c), kwargs_dict={"d": 23}, nproc_per_node=8) Args: backend (str): backend to use: `nccl`, `gloo`, `xla-tpu` @@ -269,14 +264,14 @@ def train_fn(local_rank, a, b, c, d=12): where `i` is the process index and args is the passed through tuple of arguments. args (tuple): arguments passed to `fn`. kwargs_dict (Mapping): kwargs passed to `fn`. - num_procs_per_node (int): number of processes to spawn on a single node. Default, 1. + nproc_per_node (int): number of processes to spawn on a single node. Default, 1. 
**kwargs: acceptable kwargs according to provided backend: - - | "nccl" or "gloo" : `num_nodes` (default, 1), `node_rank` (default, 0), `master_addr` + - | "nccl" or "gloo" : `nnodes` (default, 1), `node_rank` (default, 0), `master_addr` | (default, "127.0.0.1"), `master_port` (default, 2222), `timeout` to `dist.init_process_group`_ function | and kwargs for `mp.spawn`_ function. - - "xla-tpu" : `num_nodes` (default, 1), `node_rank` (default, 0) and kwargs to `xmp.spawn`_ function. + - "xla-tpu" : `nnodes` (default, 1), `node_rank` (default, 0) and kwargs to `xmp.spawn`_ function. .. _dist.init_process_group: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group .. _mp.spawn: https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn @@ -292,7 +287,7 @@ def train_fn(local_rank, a, b, c, d=12): if backend not in comp_model_cls.available_backends: continue comp_model_cls.spawn( - fn, args=args, kwargs_dict=kwargs_dict, num_procs_per_node=num_procs_per_node, backend=backend, **kwargs + fn, args=args, kwargs_dict=kwargs_dict, nproc_per_node=nproc_per_node, backend=backend, **kwargs ) @@ -448,8 +443,8 @@ def show_config(): logger.info("world size: {}".format(get_world_size())) logger.info("rank: {}".format(get_rank())) logger.info("local rank: {}".format(get_local_rank())) - logger.info("num tasks per_node: {}".format(get_ntasks_per_node())) - logger.info("num nodes: {}".format(get_num_nodes())) + logger.info("num processes per_node: {}".format(get_nproc_per_node())) + logger.info("num nodes: {}".format(get_nnodes())) logger.info("node rank: {}".format(get_node_rank())) diff --git a/tests/ignite/distributed/check_idist_parallel.py b/tests/ignite/distributed/check_idist_parallel.py index e12fc33a1f68..830d0ad09481 100644 --- a/tests/ignite/distributed/check_idist_parallel.py +++ b/tests/ignite/distributed/check_idist_parallel.py @@ -48,20 +48,20 @@ def training(local_rank, config, **kwargs): - Spawn 4 procs in single node using gloo backend: ``` - python tests/ignite/distributed/check_idist_parallel.py --backend=gloo --num_procs_per_node=4 + python tests/ignite/distributed/check_idist_parallel.py --backend=gloo --nproc_per_node=4 ``` - Spawn 2 procs in 2 nodes using gloo backend: ``` bash -c "python tests/ignite/distributed/check_idist_parallel.py --backend=gloo \ - --num_procs_per_node=2 --num_nodes=2 --node_rank=0 --master_addr=localhost --master_port=3344 &" \ + --nproc_per_node=2 --nnodes=2 --node_rank=0 --master_addr=localhost --master_port=3344 &" \ && bash -c "python tests/ignite/distributed/check_idist_parallel.py --backend=gloo \ - --num_procs_per_node=2 --num_nodes=2 --node_rank=1 --master_addr=localhost --master_port=3344 &" + --nproc_per_node=2 --nnodes=2 --node_rank=1 --master_addr=localhost --master_port=3344 &" ``` - Spawn 8 procs in single node using xla-tpu backend: ``` - python tests/ignite/distributed/check_idist_parallel.py --backend=xla-tpu --num_procs_per_node=8 + python tests/ignite/distributed/check_idist_parallel.py --backend=xla-tpu --nproc_per_node=8 ``` @@ -69,8 +69,8 @@ def training(local_rank, config, **kwargs): parser = argparse.ArgumentParser("Check idist.Parallel") parser.add_argument("--backend", type=str, default=None) - parser.add_argument("--num_procs_per_node", type=int, default=None) - parser.add_argument("--num_nodes", type=int, default=None) + parser.add_argument("--nproc_per_node", type=int, default=None) + parser.add_argument("--nnodes", type=int, default=None) parser.add_argument("--node_rank", 
type=int, default=None) parser.add_argument("--master_addr", type=str, default=None) parser.add_argument("--master_port", type=str, default=None) @@ -80,8 +80,8 @@ def training(local_rank, config, **kwargs): config = {"model": "resnet18", "lr": 0.01} dist_config = dict( - num_procs_per_node=args.num_procs_per_node, - num_nodes=args.num_nodes, + nproc_per_node=args.nproc_per_node, + nnodes=args.nnodes, node_rank=args.node_rank, master_addr=args.master_addr, master_port=args.master_port, diff --git a/tests/ignite/distributed/comp_models/test_base.py b/tests/ignite/distributed/comp_models/test_base.py index 3a50bad6ffcc..cf15c7972ff9 100644 --- a/tests/ignite/distributed/comp_models/test_base.py +++ b/tests/ignite/distributed/comp_models/test_base.py @@ -11,8 +11,8 @@ def test_serial_model(): assert model.get_local_rank() == 0 assert model.get_rank() == 0 assert model.get_world_size() == 1 - assert model.get_ntasks_per_node() == 1 - assert model.get_num_nodes() == 1 + assert model.get_nproc_per_node() == 1 + assert model.get_nnodes() == 1 assert model.get_node_rank() == 0 if torch.cuda.is_available(): assert model.device().type == "cuda" diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py index f99df1abb034..650c5fb7bcb2 100644 --- a/tests/ignite/distributed/comp_models/test_native.py +++ b/tests/ignite/distributed/comp_models/test_native.py @@ -56,8 +56,8 @@ def _assert_model(model, true_conf): assert model.get_world_size() == true_conf["world_size"] assert model.get_node_rank() == true_conf["node_index"] - assert model.get_num_nodes() == true_conf["num_nodes"] - assert model.get_ntasks_per_node() == true_conf["ntasks_per_node"] + assert model.get_nnodes() == true_conf["nnodes"] + assert model.get_nproc_per_node() == true_conf["nproc_per_node"] def _test__native_dist_model_create_from_backend_no_dist(backend, true_device): @@ -76,8 +76,8 @@ def _test__native_dist_model_create_from_backend_no_dist(backend, true_device): "rank": 0, "world_size": 1, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": 1, + "nnodes": 1, + "nproc_per_node": 1, }, ) @@ -107,8 +107,8 @@ def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_si "rank": rank, "world_size": world_size, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": world_size, + "nnodes": 1, + "nproc_per_node": world_size, }, ) @@ -181,8 +181,8 @@ def _test__native_dist_model_create_from_context_no_dist(true_backend, true_devi "rank": 0, "world_size": 1, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": 1, + "nnodes": 1, + "nproc_per_node": 1, } _test__native_dist_model_create_from_context_env_local_rank(true_conf) @@ -204,8 +204,8 @@ def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_si "rank": rank, "world_size": world_size, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": world_size, + "nnodes": 1, + "nproc_per_node": world_size, } _test__native_dist_model_create_from_context_env_local_rank(true_conf) @@ -268,7 +268,7 @@ def _test__native_dist_model_spawn(backend, num_workers_per_machine, device): args=(backend, num_workers_per_machine, device), kwargs_dict={}, backend=backend, - num_procs_per_node=num_workers_per_machine, + nproc_per_node=num_workers_per_machine, ) diff --git a/tests/ignite/distributed/comp_models/test_xla.py b/tests/ignite/distributed/comp_models/test_xla.py index fe29a6714eff..99c0d514e01f 100644 --- a/tests/ignite/distributed/comp_models/test_xla.py +++ 
b/tests/ignite/distributed/comp_models/test_xla.py @@ -29,9 +29,9 @@ def _test_xla_spawn_fn(local_rank, world_size, device): assert isinstance(d, torch.device) and d.type == device assert _model.get_rank() == local_rank - assert _model.get_ntasks_per_node() == world_size + assert _model.get_nproc_per_node() == world_size assert _model.get_node_rank() == 0 - assert _model.get_num_nodes() == 1 + assert _model.get_nnodes() == 1 @pytest.mark.tpu @@ -40,7 +40,7 @@ def _test_xla_spawn_fn(local_rank, world_size, device): def test__xla_dist_model_spawn_one_proc(): try: _XlaDistModel.spawn( - _test_xla_spawn_fn, args=(1, "xla"), kwargs_dict={}, num_procs_per_node=1, + _test_xla_spawn_fn, args=(1, "xla"), kwargs_dict={}, nproc_per_node=1, ) except SystemExit: pass @@ -53,7 +53,7 @@ def test__xla_dist_model_spawn_n_procs(): n = int(os.environ["NUM_TPU_WORKERS"]) try: _XlaDistModel.spawn( - _test_xla_spawn_fn, args=(n, "xla"), kwargs_dict={}, num_procs_per_node=n, + _test_xla_spawn_fn, args=(n, "xla"), kwargs_dict={}, nproc_per_node=n, ) except SystemExit: pass @@ -67,8 +67,8 @@ def _assert_model(model, true_conf): assert model.get_world_size() == true_conf["world_size"] assert model.get_node_rank() == true_conf["node_index"] - assert model.get_num_nodes() == true_conf["num_nodes"] - assert model.get_ntasks_per_node() == true_conf["ntasks_per_node"] + assert model.get_nnodes() == true_conf["nnodes"] + assert model.get_nproc_per_node() == true_conf["nproc_per_node"] @pytest.mark.tpu @@ -88,8 +88,8 @@ def test__xla_dist_model_create_from_backend(): "rank": 0, "world_size": 1, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": 1, + "nnodes": 1, + "nproc_per_node": 1, }, ) @@ -115,8 +115,8 @@ def test__xla_dist_model_create_from_context(): "rank": 0, "world_size": 1, "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": 1, + "nnodes": 1, + "nproc_per_node": 1, }, ) @@ -136,8 +136,8 @@ def _test__xla_dist_model_create_from_context_in_child_proc(index): "rank": xm.get_ordinal(), "world_size": xm.xrt_world_size(), "node_index": 0, - "num_nodes": 1, - "ntasks_per_node": xm.xrt_world_size(), + "nnodes": 1, + "nproc_per_node": xm.xrt_world_size(), }, ) diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py index 31ef13a67cdf..b8ccadfbca3c 100644 --- a/tests/ignite/distributed/test_launcher.py +++ b/tests/ignite/distributed/test_launcher.py @@ -75,9 +75,9 @@ def test_check_idist_parallel_torch_launch_n_procs_nccl(exec_filepath): def _test_check_idist_parallel_spawn(fp, backend, nprocs): - # python tests/ignite/distributed/check_idist_parallel.py --backend=backend --num_procs_per_node=nprocs + # python tests/ignite/distributed/check_idist_parallel.py --backend=backend --nproc_per_node=nprocs - cmd = [sys.executable, fp, "--backend={}".format(backend), "--num_procs_per_node={}".format(nprocs)] + cmd = [sys.executable, fp, "--backend={}".format(backend), "--nproc_per_node={}".format(nprocs)] out = execute(cmd) assert "backend={}".format(backend) in out @@ -121,7 +121,7 @@ def _test_func(index, ws, device): @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_parallel_gloo(): - with idist.Parallel(backend="gloo", num_procs_per_node=4) as parallel: + with idist.Parallel(backend="gloo", nproc_per_node=4) as parallel: parallel.run(_test_func, ws=4, device="cpu") @@ -139,7 +139,7 @@ def test_idist_parallel_gloo_nprocs(local_rank, 
world_size): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_parallel_nccl(): - with idist.Parallel(backend="nccl", num_procs_per_node=torch.cuda.device_count()) as parallel: + with idist.Parallel(backend="nccl", nproc_per_node=torch.cuda.device_count()) as parallel: parallel.run(_test_func, ws=torch.cuda.device_count(), device="cuda") diff --git a/tests/ignite/distributed/test_utils.py b/tests/ignite/distributed/test_utils.py index f27f16066916..c9944676cb30 100644 --- a/tests/ignite/distributed/test_utils.py +++ b/tests/ignite/distributed/test_utils.py @@ -12,10 +12,10 @@ def _sanity_check(): from ignite.distributed.utils import _model - assert _model.get_world_size() == _model.get_num_nodes() * _model.get_ntasks_per_node() - assert _model.get_local_rank() < _model.get_ntasks_per_node() + assert _model.get_world_size() == _model.get_nnodes() * _model.get_nproc_per_node() + assert _model.get_local_rank() < _model.get_nproc_per_node() assert _model.get_rank() < _model.get_world_size() - assert _model.get_node_rank() < _model.get_num_nodes() + assert _model.get_node_rank() < _model.get_nnodes() def test_no_distrib(capsys): @@ -123,7 +123,7 @@ def test_native_distrib_single_node_spawn_gloo(): world_size = 4 idist.spawn( - "gloo", _test_distrib_config, args=("gloo", world_size, "cpu"), num_procs_per_node=world_size, timeout=timeout + "gloo", _test_distrib_config, args=("gloo", world_size, "cpu"), nproc_per_node=world_size, timeout=timeout ) @@ -134,13 +134,13 @@ def test_native_distrib_single_node_spawn_gloo(): def test_native_distrib_single_node_spawn_nccl(): world_size = torch.cuda.device_count() - idist.spawn("nccl", _test_distrib_config, args=("nccl", world_size, "cuda"), num_procs_per_node=world_size) + idist.spawn("nccl", _test_distrib_config, args=("nccl", world_size, "cuda"), nproc_per_node=world_size) @pytest.mark.skipif(has_xla_support, reason="Skip if has PyTorch XLA package") def test_xla_distrib_spawn_no_xla_support(): with pytest.raises(ValueError, match=r"Backend should be one of"): - idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", 1, "xla"), num_procs_per_node=1) + idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", 1, "xla"), nproc_per_node=1) @pytest.mark.tpu @@ -157,7 +157,7 @@ def test_xla_distrib_single_node_no_spawn(): @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_xla_distrib_single_node_spawn_one_proc(): try: - idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", 1, "xla"), num_procs_per_node=1) + idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", 1, "xla"), nproc_per_node=1) except SystemExit: pass @@ -168,7 +168,7 @@ def test_xla_distrib_single_node_spawn_one_proc(): def test_xla_distrib_single_node_spawn_n_procs(): n = int(os.environ["NUM_TPU_WORKERS"]) try: - idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", n, "xla"), num_procs_per_node=n) + idist.spawn("xla-tpu", _test_distrib_config, args=("xla-tpu", n, "xla"), nproc_per_node=n) except SystemExit: pass
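For reference, below is a minimal usage sketch of the renamed arguments after this change, adapted from the `idist.Parallel` docstring examples and the cifar10 command lines in the diff above. The new keyword names match the `--nproc_per_node`/`--nnodes` flags of `torch.distributed.launch`. The `training` function, the `gloo` backend, and the process count are illustrative assumptions, not part of the patch:

```python
# Illustrative sketch only -- the function name, backend and process count are
# assumptions; the keyword names (nproc_per_node, nnodes) come from this patch.
import ignite.distributed as idist


def training(local_rank, config):
    # The renamed introspection helpers mirror the new argument names.
    print(
        "rank {}/{} | nproc_per_node={} | nnodes={} | config={}".format(
            idist.get_rank(),
            idist.get_world_size(),
            idist.get_nproc_per_node(),
            idist.get_nnodes(),
            config,
        )
    )


if __name__ == "__main__":
    config = {"model": "resnet18", "lr": 0.01}
    # Before this change: idist.Parallel(backend="gloo", num_procs_per_node=2, num_nodes=1)
    with idist.Parallel(backend="gloo", nproc_per_node=2, nnodes=1) as parallel:
        parallel.run(training, config)
```

The call behaves exactly as the old `num_procs_per_node=2` / `num_nodes=1` invocation did; only the keyword names (and the corresponding `get_nproc_per_node` / `get_nnodes` helpers) changed.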