
Commit 8ed0434

Removed size limitation for str on collective ops (#1702)
* Removed the 1024-character size limitation for str on collective ops
* Added MIN, MAX, PRODUCT options for horovod all_reduce (fixes #1697)
* Fixed a failing test and added more tests, plus minor consistency fixes
1 parent d09ed01 commit 8ed0434
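In user-facing terms, a minimal sketch of what this commit enables, assuming an initialized ignite.distributed backend (gloo, nccl, or horovod); all names below come from this repo's API:

    import torch
    import ignite.distributed as idist

    # Strings longer than 1024 characters can now go through collective ops;
    # the padded size is negotiated across processes instead of being fixed.
    long_str = "tests/ignite/distributed/utils/test_native.py" * 2000
    gathered = idist.all_gather(long_str)  # list of full-length strings

    # MIN, MAX and PRODUCT reductions now also work on the horovod backend.
    t = torch.tensor(idist.get_rank() * 2.0 + 1.0)
    smallest = idist.all_reduce(t, op="MIN")
    largest = idist.all_reduce(t, op="MAX")
    product = idist.all_reduce(t, op="PRODUCT")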

File tree: 10 files changed, +146 −58 lines


.github/workflows/hvd-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ jobs:
       - name: Run Tests
         shell: bash -l {0}
         run: |
-          SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+          bash tests/run_cpu_tests.sh

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v1

ignite/distributed/comp_models/base.py

Lines changed: 29 additions & 29 deletions
@@ -87,23 +87,22 @@ def spawn(*args: Any, **kwargs: Any) -> None:
     _collective_op_dtype = None  # type: Any

     @staticmethod
-    def _encode_str(x: str, device: torch.device) -> torch.Tensor:
-        # use fix padded size
-        size = 1024
-        if len(x) > size:
-            warnings.warn(f"Input string size {len(x)} is larger than {size} and thus will be truncated")
-            x = x[:size]
-
+    def _encode_str(x: str, device: torch.device, size: int) -> torch.Tensor:
         name = torch.tensor(bytearray(x, "utf-8")).to(device)
         padded_x = torch.zeros(size + 1, device=device, dtype=torch.long)
         padded_x[: len(name)] = name
         padded_x[-1] = len(name)
-        # output is tensor of shape (1, 1025)
+        # output is tensor of shape (1, size + 1)
         return padded_x.unsqueeze(0)

+    def _get_max_length(self, x: str, device: torch.device) -> int:
+        size = torch.tensor([len(x),], device=device)
+        size = self._do_all_reduce(size, "MAX")
+        return cast(int, size.item())
+
     @staticmethod
     def _decode_str(xs: torch.Tensor) -> List[str]:
-        # xs.shape = (n, 1025), e.g. (world_size, 1025)
+        # xs.shape = (n, size + 1), e.g. (world_size, size + 1)
         out = [bytearray(x[: x[-1]].tolist()).decode("utf-8") for x in xs]
         return out

@@ -144,7 +143,8 @@ def _collective_op(
             tensor = torch.tensor(tensor, device=device, dtype=self._collective_op_dtype)
         elif isinstance(tensor, str):
             tensor_to_str = True
-            tensor = self._encode_str(tensor, device)
+            max_length = self._get_max_length(tensor, device)
+            tensor = self._encode_str(tensor, device, size=max_length)

         tensor = self._apply_op(tensor, device, fn, *args, **kwargs)

@@ -176,20 +176,20 @@ def broadcast(self, tensor: Union[torch.Tensor, float, str], src: int = 0) -> Un
         rank = self.get_rank()
         device = self.device()
         tensor_to_number = tensor_to_str = False
-        if rank != src:
-            if isinstance(tensor, Number):
-                tensor_to_number = True
-                tensor = torch.empty(1, device=self.device(), dtype=torch.float)
-            elif isinstance(tensor, str):
-                tensor_to_str = True
-                tensor = torch.empty(1, 1025, device=self.device(), dtype=torch.long)
-        else:
-            if isinstance(tensor, Number):
-                tensor_to_number = True
+
+        if isinstance(tensor, Number):
+            tensor_to_number = True
+            if rank != src:
+                tensor = torch.empty(1, device=device, dtype=torch.float)
+            else:
                 tensor = torch.tensor([tensor,], device=device, dtype=torch.float)
-            elif isinstance(tensor, str):
-                tensor_to_str = True
-                tensor = self._encode_str(tensor, device)
+        elif isinstance(tensor, str):
+            tensor_to_str = True
+            max_length = self._get_max_length(tensor, device)
+            if rank != src:
+                tensor = torch.empty(1, max_length + 1, device=device, dtype=torch.long)
+            else:
+                tensor = self._encode_str(tensor, device, size=max_length)

         tensor = self._apply_op(tensor, device, self._do_broadcast, src)

@@ -201,7 +201,7 @@ def broadcast(self, tensor: Union[torch.Tensor, float, str], src: int = 0) -> Un
         return tensor

     @abstractmethod
-    def _do_all_reduce(self, tensor: torch.Tensor, op: str = "sum") -> torch.Tensor:
+    def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         pass

     @abstractmethod
@@ -271,7 +271,7 @@ def create_from_backend(backend: Optional[str] = None, **kwargs: Any) -> "_Seria
     def spawn(*args: Any, **kwargs: Any) -> None:
         raise NotImplementedError("Serial computation model does not implement spawn method")

-    def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "sum") -> Union[torch.Tensor, float]:
+    def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[torch.Tensor, float]:
         return tensor

     def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
@@ -282,14 +282,14 @@ def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Ten
     def broadcast(self, tensor: Union[torch.Tensor, float, str], src: int = 0) -> Union[torch.Tensor, float, str]:
         return tensor

-    def _do_all_reduce(self, tensor: torch.Tensor, op: str = "sum") -> torch.Tensor:
-        pass
+    def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
+        return tensor

     def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
-        pass
+        return tensor

     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
-        pass
+        return tensor

     def barrier(self) -> None:
         pass
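The string codec above packs utf-8 bytes into a long tensor padded to a common size, with the true length stored in the last slot; _get_max_length negotiates that size via a MAX all-reduce over per-process lengths. A standalone sketch of the round trip (encode_str/decode_str here are illustrative copies, not the library API):

    import torch

    def encode_str(x: str, device: torch.device, size: int) -> torch.Tensor:
        # utf-8 bytes padded to `size`, true length stored in the last slot
        name = torch.tensor(bytearray(x, "utf-8")).to(device)
        padded_x = torch.zeros(size + 1, device=device, dtype=torch.long)
        padded_x[: len(name)] = name
        padded_x[-1] = len(name)
        return padded_x.unsqueeze(0)  # shape (1, size + 1)

    def decode_str(xs: torch.Tensor):
        # each row: bytes up to the stored length, decoded back to a str
        return [bytearray(x[: x[-1]].tolist()).decode("utf-8") for x in xs]

    device = torch.device("cpu")
    encoded = encode_str("hello", device, size=16)
    assert decode_str(encoded) == ["hello"]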

ignite/distributed/comp_models/horovod.py

Lines changed: 12 additions & 0 deletions
@@ -165,12 +165,24 @@ def spawn(  # type: ignore[override]
         "ADASUM": hvd.mpi_ops.Adasum,
     }

+    _manual_reduce_op_map = {"MIN": torch.min, "MAX": torch.max, "PRODUCT": torch.prod}
+
     def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
+        if op in self._manual_reduce_op_map:
+            op_fn = self._manual_reduce_op_map[op]
+            return self._do_manual_all_reduce(tensor, op_fn)
         if op not in self._reduce_op_map:
             raise ValueError(f"Unsupported reduction operation: '{op}'")
         op = self._reduce_op_map[op]
         return hvd.allreduce(tensor, op=op)

+    def _do_manual_all_reduce(self, tensor: torch.Tensor, op: Any) -> torch.Tensor:
+        res = self._do_all_gather(tensor)
+        reduced_res = op(res, dim=0)
+        if isinstance(reduced_res, torch.Tensor):
+            return reduced_res
+        return reduced_res[0]
+
     def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
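Horovod's allreduce exposes SUM/AVERAGE/ADASUM but not MIN/MAX/PRODUCT, so the commit emulates those by all-gathering the per-process values and reducing locally. A self-contained sketch of that reduction step (the gathered tensor below stands in for the result of the all-gather):

    import torch

    def manual_all_reduce(gathered: torch.Tensor, op) -> torch.Tensor:
        reduced = op(gathered, dim=0)
        if isinstance(reduced, torch.Tensor):
            return reduced  # torch.prod returns a plain tensor
        # torch.min/torch.max with dim= return a (values, indices) pair
        return reduced[0]

    gathered = torch.tensor([[3.0], [1.0], [7.0]])  # one row per process
    assert manual_all_reduce(gathered, torch.min).item() == 1.0
    assert manual_all_reduce(gathered, torch.max).item() == 7.0
    assert manual_all_reduce(gathered, torch.prod).item() == 21.0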

ignite/distributed/utils.py

Lines changed: 1 addition & 1 deletion
@@ -331,7 +331,7 @@ def all_reduce(tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[tor
     Args:
         tensor: tensor or number to collect across participating processes.
         op: reduction operation, "SUM" by default. Possible values: "SUM", "PRODUCT", "MIN", "MAX", "AND", "OR".
-            Please, several values are not supported for the backend like "horovod".
+            Horovod backend supports only "SUM", "AVERAGE", "ADASUM", "MIN", "MAX", "PRODUCT".

     Returns:
         torch.Tensor or number
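For reference, a usage sketch matching the updated docstring; this assumes a distributed group is already initialized:

    import torch
    import ignite.distributed as idist

    loss = torch.tensor(0.5, device=idist.device())
    total = idist.all_reduce(loss)            # "SUM" is the default
    worst = idist.all_reduce(loss, op="MAX")  # now valid on horovod too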

tests/ignite/distributed/comp_models/test_base.py

Lines changed: 4 additions & 4 deletions
@@ -27,17 +27,17 @@ def test_serial_model():
     model.all_reduce(1)
     model.all_gather(1)
     model.broadcast(1)
-    model._do_all_reduce(torch.tensor(1))
-    model._do_all_gather(torch.tensor(1))
-    model._do_broadcast(torch.tensor(1), src=0)
+    assert model._do_all_reduce(torch.tensor(1)) == torch.tensor(1)
+    assert model._do_all_gather(torch.tensor(1)) == torch.tensor(1)
+    assert model._do_broadcast(torch.tensor(1), src=0) == torch.tensor(1)
     model.barrier()


 def test__encode_str__decode_str():
     device = torch.device("cpu")
     s = "test-abcedfg"

-    encoded_s = ComputationModel._encode_str(s, device)
+    encoded_s = ComputationModel._encode_str(s, device, 1024)
     assert isinstance(encoded_s, torch.Tensor) and encoded_s.shape == (1, 1025)

     decoded_s = ComputationModel._decode_str(encoded_s)

tests/ignite/distributed/test_auto.py

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ def test_auto_methods_no_dist():
     _test_auto_dataloader(1, 1, batch_size=10, num_workers=2)
     _test_auto_dataloader(1, 1, batch_size=10, sampler_name="WeightedRandomSampler")

-    _test_auto_model_optimizer(1, "cpu")
+    _test_auto_model_optimizer(1, "cuda" if torch.cuda.is_available() else "cpu")


 @pytest.mark.distributed

tests/ignite/distributed/utils/__init__.py

Lines changed: 45 additions & 16 deletions
@@ -56,6 +56,16 @@ def _test_sync(cls):
     assert isinstance(_model, cls), f"{type(_model)} vs {cls}"


+def _test_distrib__get_max_length(device):
+    ws = idist.get_world_size()
+    x = "_test_distrib__get_max_length" * (idist.get_rank() + 2)
+
+    from ignite.distributed.utils import _model
+
+    res = _model._get_max_length(x, device)
+    assert res == len("_test_distrib__get_max_length" * (ws + 1))
+
+
 def _test_distrib_all_reduce(device):

     res = idist.all_reduce(10)
@@ -65,9 +75,27 @@ def _test_distrib_all_reduce(device):
     res = idist.all_reduce(t)
     assert res.item() == 10 * idist.get_world_size()

-    t = torch.tensor(idist.get_rank(), device=device)
+    rank = idist.get_rank()
+    t = torch.tensor(rank * 2.0 + 1.0, device=device)
     res = idist.all_reduce(t)
-    assert res.item() == sum([i for i in range(idist.get_world_size())])
+    assert res.item() == sum([i * 2.0 + 1.0 for i in range(idist.get_world_size())])
+
+    t = torch.tensor(rank * 2.0 + 1.0, device=device)
+    res = idist.all_reduce(t, "MIN").item()
+    true_val = min([i * 2 + 1 for i in range(idist.get_world_size())])
+    assert res == true_val, f"{res} vs {true_val}"
+
+    t = torch.tensor(rank * 2.0 + 1.0, device=device)
+    res = idist.all_reduce(t, "MAX").item()
+    true_val = max([i * 2.0 + 1.0 for i in range(idist.get_world_size())])
+    assert res == true_val, f"{res} vs {true_val}"
+
+    t = torch.tensor(rank * 2.0 + 1.0, device=device)
+    res = idist.all_reduce(t, "PRODUCT").item()
+    true_val = 1
+    for v in [i * 2.0 + 1.0 for i in range(idist.get_world_size())]:
+        true_val *= v
+    assert res == true_val, f"{res} vs {true_val}"

     if idist.get_world_size() > 1:
         with pytest.raises(TypeError, match=r"Unhandled input type"):
@@ -99,17 +127,13 @@ def _test_distrib_all_gather(device):
     true_res = ["abc",] + ["test-test"] * (idist.get_world_size() - 1)
     assert res == true_res

-    base_x = "x" * 1026
+    base_x = "tests/ignite/distributed/utils/test_native.py" * 2000
     x = base_x
     if idist.get_rank() == 0:
         x = "abc"

-    if idist.get_rank() > 0:
-        with pytest.warns(UserWarning, match=r"is larger than 1024 and thus will be truncated"):
-            res = idist.all_gather(x)
-    else:
-        res = idist.all_gather(x)
-    true_res = ["abc",] + [base_x[:1024]] * (idist.get_world_size() - 1)
+    res = idist.all_gather(x)
+    true_res = ["abc",] + [base_x] * (idist.get_world_size() - 1)
     assert res == true_res

     t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1)
@@ -147,14 +171,19 @@ def _test_distrib_broadcast(device):
     true_res = torch.tensor([1.2345, 2.3456], dtype=torch.float, device=device)
     assert (res == true_res).all(), f"{res} vs {true_res}"

-    if rank == src:
-        t = "test-abcdefg"
-    else:
-        t = ""
+    def _test(text):

-    res = idist.broadcast(t, src=src)
-    true_res = "test-abcdefg"
-    assert res == true_res
+        if rank == src:
+            t = text
+        else:
+            t = ""
+
+        res = idist.broadcast(t, src=src)
+        true_res = text
+        assert res == true_res
+
+    _test("test-abcdefg")
+    _test("tests/ignite/distributed/utils/test_horovod.py::test_idist_broadcast_hvd" * 200)

     if rank == src:
         t = torch.arange(100, device=device).reshape(4, 25) * (src + 1)
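The expected value in _test_distrib__get_max_length follows from the rank with the longest string: rank ws - 1 repeats the marker ws + 1 times, so the negotiated MAX length matches it. A quick check of that arithmetic:

    marker = "_test_distrib__get_max_length"
    ws = 4  # example world size
    lengths = [len(marker * (rank + 2)) for rank in range(ws)]
    assert max(lengths) == len(marker * (ws + 1))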

tests/ignite/distributed/utils/test_horovod.py

Lines changed: 11 additions & 0 deletions
@@ -6,6 +6,7 @@
 import ignite.distributed as idist
 from ignite.distributed.utils import has_hvd_support
 from tests.ignite.distributed.utils import (
+    _test_distrib__get_max_length,
     _test_distrib_all_gather,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
@@ -145,6 +146,16 @@ def test_idist_all_reduce_hvd(gloo_hvd_executor):
     gloo_hvd_executor(_test_distrib_all_reduce, (device,), np=np, do_init=True)


+@pytest.mark.distributed
+@pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
+@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
+def test_idist__model_methods_hvd(gloo_hvd_executor):
+
+    device = "cpu" if not torch.cuda.is_available() else "cuda"
+    np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
+    gloo_hvd_executor(_test_distrib__get_max_length, (device,), np=np, do_init=True)
+
+
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")

tests/ignite/distributed/utils/test_native.py

Lines changed: 18 additions & 0 deletions
@@ -7,6 +7,7 @@
 import ignite.distributed as idist
 from ignite.distributed.utils import has_native_dist_support
 from tests.ignite.distributed.utils import (
+    _test_distrib__get_max_length,
     _test_distrib_all_gather,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
@@ -152,6 +153,23 @@ def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context
     _test_idist_methods_in_native_context_set_local_rank("nccl", "cuda", local_rank)


+@pytest.mark.distributed
+@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+def test_idist__model_methods_nccl(distributed_context_single_node_nccl):
+
+    device = f"cuda:{distributed_context_single_node_nccl['local_rank']}"
+    _test_distrib__get_max_length(device)
+
+
+@pytest.mark.distributed
+@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
+def test_idist__model_methods_gloo(distributed_context_single_node_gloo):
+
+    device = "cpu"
+    _test_distrib__get_max_length(device)
+
+
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")

tests/ignite/distributed/utils/test_serial.py

Lines changed: 24 additions & 6 deletions
@@ -1,7 +1,15 @@
 import torch

 import ignite.distributed as idist
-from tests.ignite.distributed.utils import _sanity_check, _test_sync
+from tests.ignite.distributed.utils import (
+    _sanity_check,
+    _test_distrib__get_max_length,
+    _test_distrib_all_gather,
+    _test_distrib_all_reduce,
+    _test_distrib_barrier,
+    _test_distrib_broadcast,
+    _test_sync,
+)


 def test_no_distrib(capsys):
@@ -48,10 +56,20 @@ def test_idist_methods_no_dist():
     assert idist.backend() is None, f"{idist.backend()}"


-def test_idist_all_reduce_no_dist():
-    assert idist.all_reduce(10) == 10
+def test_idist__model_methods_no_dist():
+    _test_distrib__get_max_length("cpu")
+    if torch.cuda.device_count() > 1:
+        _test_distrib__get_max_length("cuda")


-def test_idist_all_gather_no_dist():
-    assert idist.all_gather(10) == [10]
-    assert (idist.all_gather(torch.tensor(10)) == torch.tensor(10)).all()
+def test_idist_collective_ops_no_dist():
+    _test_distrib_all_reduce("cpu")
+    _test_distrib_all_gather("cpu")
+    _test_distrib_barrier("cpu")
+    _test_distrib_broadcast("cpu")
+
+    if torch.cuda.device_count() > 1:
+        _test_distrib_all_reduce("cuda")
+        _test_distrib_all_gather("cuda")
+        _test_distrib_barrier("cuda")
+        _test_distrib_broadcast("cuda")
