
Commit 24b9890

colin2328 authored and pytorchmergebot committed
[torchrec] [composable] update ShardedEmbeddingBagCollection to use registered EBCs with ShardedTensors as registered modules (#758) (pytorch#88026)
Summary:
X-link: meta-pytorch/torchrec#758

This PR fixes a bug in FSDP/DDP where ShardedTensors are not supported even when they are passed in as params to ignore. This matters for composability because TorchRec named_parameters() returns the FQNs of ShardedTensors (as defined in the composability goals).

It defines the device of a ShardedTensor to be None when local_tensor() does not exist on the rank, and updates ShardedEmbeddingBagCollection to be composable according to https://docs.google.com/document/d/1TBJSd5zgEg6cRcXv3Okuj7bBkqQwGS2IPh4TLWNNzFI/edit

Differential Revision: D40458625
Pull Request resolved: pytorch#88026
Approved by: https://github.com/wanchaol, https://github.com/rohan-varma
1 parent 1cd6ebe commit 24b9890
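
Before the file-by-file diff, a minimal, self-contained sketch of the ignore flow that this change makes safe for ShardedTensor parameters. It uses a single-process gloo group and ordinary dense parameters as stand-ins (a real ShardedTensor needs multiple ranks, as the new test below shows); the module, FQNs, and MASTER_ADDR/MASTER_PORT values are illustrative, not taken from the diff.

import os
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

# single-process gloo group, only so that DDP construction is possible in a sketch
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", world_size=1, rank=0)

# "0.weight" / "0.bias" stand in for the FQNs of sharded parameters
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))

# register the FQNs DDP should skip, then wrap as usual; after this commit the
# same call also works when the named parameter is a ShardedTensor
DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
    module=model,
    params_and_buffers_to_ignore={"0.weight", "0.bias"},
)
ddp_model = DistributedDataParallel(model, broadcast_buffers=False)

dist.destroy_process_group()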

File tree

4 files changed: +82 additions, -23 deletions


test/distributed/test_c10d_gloo.py

Lines changed: 56 additions & 6 deletions
@@ -23,28 +23,35 @@
 import torch.nn.functional as F
 import torch.testing._internal.common_utils as common
 from test_c10d_common import (
-    LOOPBACK,
     gpus_for_rank,
-    Task,
+    LOOPBACK,
     ModuleForDdpCommHook,
     SparseGradientModule,
+    Task,
 )
 from torch import nn
+from torch.distributed._shard.sharded_tensor import (
+    init_from_local_shards,
+    Shard,
+    ShardedTensor,
+    ShardMetadata,
+)
 from torch.nn.parallel import DistributedDataParallel
+from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
 from torch.testing._internal.common_distributed import (
+    create_device,
     MultiProcessTestCase,
     requires_gloo,
-    skip_if_lt_x_gpu,
     simple_sparse_reduce_tests,
+    skip_if_lt_x_gpu,
     skip_if_win32,
-    create_device,
     verify_ddp_error_logged,
 )
 from torch.testing._internal.common_utils import (
-    TestCase,
-    run_tests,
     retry_on_connect_failures,
+    run_tests,
     sandcastle_skip,
+    TestCase,
 )


@@ -1754,6 +1761,49 @@ def forward(self, x):
             loss = criterion(output, target)
             loss.backward()

+    @requires_gloo()
+    @skip_if_lt_x_gpu(2)
+    def test_ignored_sharded_tensor(self):
+        class MyModule(nn.Module):
+            def __init__(self, shard_tensor: ShardedTensor) -> None:
+                super().__init__()
+                self.fc1 = nn.Linear(2, 10, bias=False)
+                self.st = nn.Parameter(shard_tensor)
+                self.relu = nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(self.fc1(x))
+                return F.softmax(x, dim=1)
+        pg = dist.init_process_group(
+            "gloo",
+            init_method=f"file://{self.file_name}",
+            world_size=self.world_size,
+            rank=self.rank,
+        )
+        device = torch.device(f"cuda:{self.rank}")
+        local_shard_metadata = ShardMetadata(
+            shard_offsets=[(self.rank % 2) * 5, 0],
+            shard_sizes=[5, 10],
+            placement=f"rank:{self.rank}/cuda:{self.rank}"
+        )
+        local_shards = [Shard(torch.randn(5, 10, device=device), local_shard_metadata)]
+        st = init_from_local_shards(local_shards, [10, 10])
+        m = MyModule(st)
+        with _ddp_replicated_tensor(False):
+            DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
+                module=m,
+                params_and_buffers_to_ignore={'st'}
+            )
+            # check that the DDP constructor does not fail when the module includes an ignored ShardedTensor
+            DistributedDataParallel(
+                m,
+                device_ids=[device] if device.type == "gpu" else None,
+                process_group=pg,
+                gradient_as_bucket_view=True,
+                broadcast_buffers=False,
+                static_graph=True,
+            )
+
     def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model):
         mult = 2
         batch_size = mult * self.world_size

torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py

Lines changed: 8 additions & 3 deletions
@@ -42,9 +42,14 @@ def tensor_device(types, args=(), kwargs=None, pg=None):
     # Validate types
     if not isinstance(self_st, ShardedTensor):
         raise TypeError("input needs to be a ShardedTensor")
-
-    return self_st.local_shards()[0].tensor.device
-
+    dev: torch.device
+    if self_st._local_shards:
+        dev = self_st._local_shards[0].tensor.device
+    elif pg and pg._get_backend_name() == "gloo":
+        dev = torch.device("cpu")
+    else:
+        dev = torch.device(torch.cuda.current_device())
+    return dev

 @_sharded_op_impl(torch.Tensor.is_meta.__get__)  # type: ignore[attr-defined]
 def st_is_meta(types, args=(), kwargs=None, pg=None):
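
For orientation only, a minimal sketch of the device fallback added above, run on a single-process gloo group with a CPU shard. It assumes that reading .device on a ShardedTensor dispatches to this tensor_device handler (the registration decorator is not visible in the hunk), and the MASTER_ADDR/MASTER_PORT values are placeholders.

import os
import torch
import torch.distributed as dist
from torch.distributed._shard.sharded_tensor import (
    init_from_local_shards,
    Shard,
    ShardMetadata,
)

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", world_size=1, rank=0)

# one CPU shard covering the whole [5, 10] global tensor on rank 0
meta = ShardMetadata(shard_offsets=[0, 0], shard_sizes=[5, 10], placement="rank:0/cpu")
st = init_from_local_shards([Shard(torch.randn(5, 10), meta)], [5, 10])

# rank with local shards: device comes from the first local shard
print(st.device)  # cpu
# on a rank that owns no shards, the handler above now falls back to cpu for
# gloo process groups, or to the current CUDA device otherwise, instead of
# indexing into an empty local_shards() list
dist.destroy_process_group()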

torch/distributed/_shard/sharded_tensor/api.py

Lines changed: 7 additions & 1 deletion
@@ -630,7 +630,13 @@ def cuda(
         return st_cuda

     def to(self, *args, **kwargs) -> ShardedTensor:
-        current_device = self._local_shards[0].tensor.device
+        current_device: torch.device
+        if self._local_shards:
+            current_device = self._local_shards[0].tensor.device
+        elif self._process_group._get_backend_name() == "gloo":
+            current_device = torch.device("cpu")
+        else:
+            current_device = torch.device(torch.cuda.current_device())
         current_dtype = self.dtype
         device_to = current_device
         dtype_to = current_dtype
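
A similarly hedged sketch for to(): with the fallback in place, the starting device is well defined even on a rank that owns no local shards. The dtype-only conversion below is an assumption about what to() accepts in this version, not something the diff itself exercises.

import os
import torch
import torch.distributed as dist
from torch.distributed._shard.sharded_tensor import (
    init_from_local_shards,
    Shard,
    ShardMetadata,
)

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", world_size=1, rank=0)

meta = ShardMetadata(shard_offsets=[0, 0], shard_sizes=[5, 10], placement="rank:0/cpu")
st = init_from_local_shards([Shard(torch.randn(5, 10), meta)], [5, 10])

# current_device is taken from the first local shard here; on a shard-less
# rank the new branches pick cpu (gloo) or the current CUDA device, so the
# call no longer fails before the conversion even starts
st64 = st.to(torch.float64)
print(st64.dtype)  # torch.float64

dist.destroy_process_group()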

torch/nn/parallel/distributed.py

Lines changed: 11 additions & 13 deletions
@@ -553,11 +553,15 @@ def __init__(
         gradient_as_bucket_view=False,
         static_graph=False,
     ):
-
         super(DistributedDataParallel, self).__init__()
         Joinable.__init__(self)
         self.logger = None
-        if not any((p.requires_grad for p in module.parameters())):
+        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
+            self.parameters_to_ignore = set(module._ddp_params_and_buffers_to_ignore)
+        else:
+            self.parameters_to_ignore = set()
+        self._module_parameters = [p for n, p in module.named_parameters() if n not in self.parameters_to_ignore]
+        if not any((p.requires_grad for p in self._module_parameters)):
             self._log_and_throw(
                 RuntimeError,
                 "DistributedDataParallel is not needed when a module "
@@ -570,10 +574,8 @@ def __init__(
                 "device_ids can only be None or contain a single element.",
             )

-        self.is_multi_device_module = (
-            len({p.device for p in module.parameters()}) > 1
-        )
-        distinct_device_types = {p.device.type for p in module.parameters()}
+        self.is_multi_device_module = len({p.device for p in self._module_parameters}) > 1
+        distinct_device_types = {p.device.type for p in self._module_parameters if p.device is not None}
         if len(distinct_device_types) != 1:
             self._log_and_throw(
                 ValueError,
@@ -599,7 +601,7 @@ def __init__(
                     "but got device_ids {}, output_device {}, and module parameters {}.".format(
                         device_ids,
                         output_device,
-                        {p.device for p in module.parameters()},
+                        {p.device for p in self._module_parameters},
                     ),
                 )

@@ -621,16 +623,12 @@ def __init__(
         self.static_graph = False
         self.dim = dim
         self.module = module
-        self.device = list(self.module.parameters())[0].device
+        self.device = list(self._module_parameters)[0].device
         self.broadcast_buffers = broadcast_buffers
         self.find_unused_parameters = find_unused_parameters
         self.require_backward_grad_sync = True
         self.require_forward_param_sync = True
         self.gradient_as_bucket_view = gradient_as_bucket_view
-        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
-            self.parameters_to_ignore = module._ddp_params_and_buffers_to_ignore
-        else:
-            self.parameters_to_ignore = []

         self._use_replicated_tensor_module = (
             _ddp_with_replicated_tensor_enabled()
@@ -647,7 +645,7 @@ def __init__(
             )

         # Check that a module does not have Uninitialized parameters
-        for param in module.parameters():
+        for param in self._module_parameters:
             if isinstance(param, torch.nn.parameter.UninitializedParameter):
                 self._log_and_throw(
                     RuntimeError,
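
Taken together, the distributed.py hunks make every parameter walk in __init__ use the pre-filtered self._module_parameters list instead of module.parameters(). A tiny stand-alone illustration of that filtering; the module and ignored FQN are made up for the sketch.

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
ignored = {"0.weight"}  # FQNs registered via _set_params_and_buffers_to_ignore_for_model

# mirrors the new self._module_parameters computation in __init__ above
module_parameters = [p for n, p in model.named_parameters() if n not in ignored]

print([n for n, _ in model.named_parameters()])  # ['0.weight', '0.bias', '1.weight', '1.bias']
print(len(module_parameters))                    # 3 (the ignored '0.weight' is skipped)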
