
Commit eb6f4ab

TroyGarden authored and facebook-github-bot committed
enable customized emb lookup kernel for TorchRec
Summary:

# context
* The NVIDIA dynamicemb package depends on an old TorchRec release (r0.7) plus a PR ([meta-pytorch#2533](meta-pytorch#2533)).
* The goal is to refactor that PR ([meta-pytorch#2533](meta-pytorch#2533)) on trunk so that TorchRec can accept customized compute kernels.

# design rationales
* Because [`EmbeddingComputeKernel`](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/embedding_types.py#L64-L72) is an Enum class that can't be dynamically extended outside of the TorchRec codebase, we add a placeholder member named `customized_kernel` that stands for all customized compute kernels.
* `compute_kernel` is set in [ParameterSharding](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/types.py#L694), along with `sharding_type`, `sharding_specs`, etc. Users can subclass the `ParameterSharding` dataclass to add the extra configs and parameters their customized compute kernel needs, including something like `customized_compute_kernel` to select the exact kernel when there are several (a minimal sketch follows this summary).
* To propagate such [extra config](https://fburl.com/code/bnwp44sz) to the customized kernel, we add a `get_additional_fused_params` hook that merges those params into `fused_params`. (We might consider moving the [`add_params_from_parameter_sharding`](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/utils.py#L359) function onto `ParameterSharding` as a method, so that users can override it when necessary.)

  NOTE: `fused_params` was originally used to pass required parameters to the fbgemm lookup kernels (e.g., TBE, see below). It now also serves as a convenient way of [propagating configs from `ParameterSharding` to the kernel](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/utils.py#L359).
  ```
  (Pdb) group_fused_params
  {'optimizer': <EmbOptimType.EXACT_ADAGRAD: 'exact_adagrad'>, 'learning_rate': 0.1}
  ```
* Besides the lookup module, a customized kernel very often also needs a customized input_dist and/or output_dist. These all come from [EmbeddingSharding](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/embedding_sharding.py#L964) and its [child classes](https://github.com/pytorch/torchrec/tree/main/torchrec/distributed/sharding) such as cw_sharding, tw_sharding, etc.
* We therefore move the main API [`create_embedding_sharding`](https://github.com/pytorch/torchrec/blob/main/torchrec/distributed/embedding.py#L150) onto the sharded module as an overridable classmethod; it returns a subclass of `EmbeddingSharding`, which in turn creates the user-defined input_dist, output_dist, lookup modules, and so on.

WARNING: the HKV-based customized compute kernel currently can't go through `_initialize_torch_state`, likely because the table.weight tensor is no longer on the GPU and therefore can't be represented as a ShardedTensor or DTensor. It is the user's responsibility to handle the state_dict correctly by overriding the `_initialize_torch_state` function.

Differential Revision: D70723583
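Below is a minimal sketch of the `ParameterSharding` extension point described above, assuming the subclass lives in user code: the class name `DynamicEmbParameterSharding`, the fields `customized_compute_kernel` and `dynamicemb_options`, and the placement of `get_additional_fused_params` as a dataclass method are illustrative assumptions, not part of this diff.

```python
from dataclasses import dataclass, field
from typing import Any, Dict

from torchrec.distributed.types import ParameterSharding


@dataclass
class DynamicEmbParameterSharding(ParameterSharding):
    # Extra, kernel-specific knobs; the base dataclass already carries
    # sharding_type, compute_kernel, ranks, etc.
    customized_compute_kernel: str = "hkv"
    dynamicemb_options: Dict[str, Any] = field(default_factory=dict)

    def get_additional_fused_params(self) -> Dict[str, Any]:
        # Everything returned here is merged into `fused_params`, which is how
        # the extra config reaches the customized lookup kernel.
        return {
            "customized_compute_kernel": self.customized_compute_kernel,
            **self.dynamicemb_options,
        }
```

The returned dict would be folded into `fused_params` by `add_params_from_parameter_sharding` (or its possible future method form) and handed to the lookup kernel at construction time.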
1 parent 75f1f1c · commit eb6f4ab

File tree: 7 files changed, +259 −194 lines


torchrec/distributed/embedding.py

Lines changed: 63 additions & 44 deletions
@@ -147,46 +147,6 @@ def get_ec_index_dedup() -> bool:
     return EC_INDEX_DEDUP
 
 
-def create_embedding_sharding(
-    sharding_type: str,
-    sharding_infos: List[EmbeddingShardingInfo],
-    env: ShardingEnv,
-    device: Optional[torch.device] = None,
-    qcomm_codecs_registry: Optional[Dict[str, QuantizedCommCodecs]] = None,
-) -> EmbeddingSharding[
-    SequenceShardingContext, KeyedJaggedTensor, torch.Tensor, torch.Tensor
-]:
-    if sharding_type == ShardingType.TABLE_WISE.value:
-        return TwSequenceEmbeddingSharding(
-            sharding_infos=sharding_infos,
-            env=env,
-            device=device,
-            qcomm_codecs_registry=qcomm_codecs_registry,
-        )
-    elif sharding_type == ShardingType.ROW_WISE.value:
-        return RwSequenceEmbeddingSharding(
-            sharding_infos=sharding_infos,
-            env=env,
-            device=device,
-            qcomm_codecs_registry=qcomm_codecs_registry,
-        )
-    elif sharding_type == ShardingType.DATA_PARALLEL.value:
-        return DpSequenceEmbeddingSharding(
-            sharding_infos=sharding_infos,
-            env=env,
-            device=device,
-        )
-    elif sharding_type == ShardingType.COLUMN_WISE.value:
-        return CwSequenceEmbeddingSharding(
-            sharding_infos=sharding_infos,
-            env=env,
-            device=device,
-            qcomm_codecs_registry=qcomm_codecs_registry,
-        )
-    else:
-        raise ValueError(f"Sharding not supported {sharding_type}")
-
-
 def create_sharding_infos_by_sharding(
     module: EmbeddingCollectionInterface,
     table_name_to_parameter_sharding: Dict[str, ParameterSharding],
@@ -557,7 +517,7 @@ def __init__(
                 SequenceShardingContext, KeyedJaggedTensor, torch.Tensor, torch.Tensor
             ],
         ] = {
-            sharding_type: create_embedding_sharding(
+            sharding_type: self.create_embedding_sharding(
                 sharding_type=sharding_type,
                 sharding_infos=embedding_confings,
                 env=env,
@@ -637,6 +597,51 @@ def __init__(
         if module.device != torch.device("meta"):
             self.load_state_dict(module.state_dict())
 
+    @classmethod
+    def create_embedding_sharding(
+        cls,
+        sharding_type: str,
+        sharding_infos: List[EmbeddingShardingInfo],
+        env: ShardingEnv,
+        device: Optional[torch.device] = None,
+        qcomm_codecs_registry: Optional[Dict[str, QuantizedCommCodecs]] = None,
+    ) -> EmbeddingSharding[
+        SequenceShardingContext, KeyedJaggedTensor, torch.Tensor, torch.Tensor
+    ]:
+        """
+        This is the main function to generate `EmbeddingSharding` instances based on sharding_type
+        so that the same sharding_type in one EC would be fused.
+        """
+        if sharding_type == ShardingType.TABLE_WISE.value:
+            return TwSequenceEmbeddingSharding(
+                sharding_infos=sharding_infos,
+                env=env,
+                device=device,
+                qcomm_codecs_registry=qcomm_codecs_registry,
+            )
+        elif sharding_type == ShardingType.ROW_WISE.value:
+            return RwSequenceEmbeddingSharding(
+                sharding_infos=sharding_infos,
+                env=env,
+                device=device,
+                qcomm_codecs_registry=qcomm_codecs_registry,
+            )
+        elif sharding_type == ShardingType.DATA_PARALLEL.value:
+            return DpSequenceEmbeddingSharding(
+                sharding_infos=sharding_infos,
+                env=env,
+                device=device,
+            )
+        elif sharding_type == ShardingType.COLUMN_WISE.value:
+            return CwSequenceEmbeddingSharding(
+                sharding_infos=sharding_infos,
+                env=env,
+                device=device,
+                qcomm_codecs_registry=qcomm_codecs_registry,
+            )
+        else:
+            raise ValueError(f"Sharding not supported {sharding_type}")
+
     @staticmethod
     def _pre_state_dict_hook(
         self: "ShardedEmbeddingCollection",
@@ -757,14 +762,23 @@ def _initialize_torch_state(self) -> None:  # noqa
             parameter_sharding,
         ) in self.module_sharding_plan.items():
             if parameter_sharding.sharding_type == ShardingType.DATA_PARALLEL.value:
+                # Don't need to use sharded/distributed state tensor for DATA_PARALLEL
+                # because each rank has a full copy of the table in DATA_PARALLEL
+                continue
+            _model_parallel_name_to_compute_kernel[table_name] = (
+                parameter_sharding.compute_kernel
+            )
+            if (
+                parameter_sharding.compute_kernel
+                == EmbeddingComputeKernel.CUSTOMIZED_KERNEL.value
+            ):
+                # Skip state_dict handling for CUSTOMIZED_KERNEL, this should be implemented
+                # in child class for the CUSTOMIZED_KERNEL
                 continue
             self._model_parallel_name_to_local_shards[table_name] = []
             self._model_parallel_name_to_shards_wrapper[table_name] = OrderedDict(
                 [("local_tensors", []), ("local_offsets", [])]
            )
-            _model_parallel_name_to_compute_kernel[table_name] = (
-                parameter_sharding.compute_kernel
-            )
 
         self._name_to_table_size = {}
         for table in self._embedding_configs:
@@ -783,6 +797,11 @@ def _initialize_torch_state(self) -> None:  # noqa
             # save local_shards for transforming MP params to shardedTensor
             for key, v in lookup.state_dict().items():
                 table_name = key[: -len(".weight")]
+                if (
+                    _model_parallel_name_to_compute_kernel[table_name]
+                    == EmbeddingComputeKernel.CUSTOMIZED_KERNEL.value
+                ):
+                    continue
                 if isinstance(v, DTensor):
                     shards_wrapper = self._model_parallel_name_to_shards_wrapper[
                         table_name
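With `create_embedding_sharding` now an overridable classmethod on `ShardedEmbeddingCollection`, a customized stack can return its own `EmbeddingSharding`, which in turn builds the customized input_dist, lookup, and output_dist. A minimal sketch, assuming a hypothetical user-provided `HKVRwSequenceEmbeddingSharding` (the `dynamicemb.sharding` import path is made up); in practice the corresponding sharder would also need to be set up so this subclass actually gets instantiated:

```python
from typing import Dict, List, Optional

import torch
from torchrec.distributed.embedding import ShardedEmbeddingCollection
from torchrec.distributed.embedding_sharding import (
    EmbeddingSharding,
    EmbeddingShardingInfo,
)
from torchrec.distributed.sharding.sequence_sharding import SequenceShardingContext
from torchrec.distributed.types import QuantizedCommCodecs, ShardingEnv, ShardingType
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# Hypothetical user-provided sharding that wires in the customized
# input_dist, lookup (e.g. HKV-backed), and output_dist.
from dynamicemb.sharding import HKVRwSequenceEmbeddingSharding  # assumption


class DynamicEmbShardedEmbeddingCollection(ShardedEmbeddingCollection):
    @classmethod
    def create_embedding_sharding(
        cls,
        sharding_type: str,
        sharding_infos: List[EmbeddingShardingInfo],
        env: ShardingEnv,
        device: Optional[torch.device] = None,
        qcomm_codecs_registry: Optional[Dict[str, QuantizedCommCodecs]] = None,
    ) -> EmbeddingSharding[
        SequenceShardingContext, KeyedJaggedTensor, torch.Tensor, torch.Tensor
    ]:
        # Route row-wise tables to the customized sharding; defer everything
        # else to the stock TorchRec shardings via the parent classmethod.
        if sharding_type == ShardingType.ROW_WISE.value:
            return HKVRwSequenceEmbeddingSharding(
                sharding_infos=sharding_infos,
                env=env,
                device=device,
                qcomm_codecs_registry=qcomm_codecs_registry,
            )
        return super().create_embedding_sharding(
            sharding_type=sharding_type,
            sharding_infos=sharding_infos,
            env=env,
            device=device,
            qcomm_codecs_registry=qcomm_codecs_registry,
        )
```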

torchrec/distributed/embedding_lookup.py

Lines changed: 72 additions & 71 deletions
@@ -181,46 +181,11 @@ def __init__(
         pg: Optional[dist.ProcessGroup] = None,
         device: Optional[torch.device] = None,
     ) -> None:
-        # TODO rename to _create_embedding_kernel
-        def _create_lookup(
-            config: GroupedEmbeddingConfig,
-        ) -> BaseEmbedding:
-            for table in config.embedding_tables:
-                if (
-                    table.compute_kernel == EmbeddingComputeKernel.FUSED_UVM_CACHING
-                    or table.compute_kernel == EmbeddingComputeKernel.KEY_VALUE
-                ):
-                    self._need_prefetch = True
-            if config.compute_kernel == EmbeddingComputeKernel.DENSE:
-                return BatchedDenseEmbedding(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                )
-            elif config.compute_kernel == EmbeddingComputeKernel.FUSED:
-                return BatchedFusedEmbedding(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                )
-            elif config.compute_kernel in {
-                EmbeddingComputeKernel.KEY_VALUE,
-            }:
-                return KeyValueEmbedding(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                )
-            else:
-                raise ValueError(
-                    f"Compute kernel not supported {config.compute_kernel}"
-                )
-
         super().__init__()
         self._emb_modules: nn.ModuleList = nn.ModuleList()
         self._need_prefetch: bool = False
         for config in grouped_configs:
-            self._emb_modules.append(_create_lookup(config))
+            self._emb_modules.append(self._create_embedding_kernel(config, pg, device))
 
         self._feature_splits: List[int] = []
         for config in grouped_configs:
@@ -239,6 +204,41 @@ def _create_lookup(
 
         self.grouped_configs = grouped_configs
 
+    def _create_embedding_kernel(
+        self,
+        config: GroupedEmbeddingConfig,
+        pg: Optional[dist.ProcessGroup],
+        device: Optional[torch.device],
+    ) -> BaseEmbedding:
+        for table in config.embedding_tables:
+            if (
+                table.compute_kernel == EmbeddingComputeKernel.FUSED_UVM_CACHING
+                or table.compute_kernel == EmbeddingComputeKernel.KEY_VALUE
+            ):
+                self._need_prefetch = True
+        if config.compute_kernel == EmbeddingComputeKernel.DENSE:
+            return BatchedDenseEmbedding(
+                config=config,
+                pg=pg,
+                device=device,
+            )
+        elif config.compute_kernel == EmbeddingComputeKernel.FUSED:
+            return BatchedFusedEmbedding(
+                config=config,
+                pg=pg,
+                device=device,
+            )
+        elif config.compute_kernel in {
+            EmbeddingComputeKernel.KEY_VALUE,
+        }:
+            return KeyValueEmbedding(
+                config=config,
+                pg=pg,
+                device=device,
+            )
+        else:
+            raise ValueError(f"Compute kernel not supported {config.compute_kernel}")
+
     def prefetch(
         self,
         sparse_features: KeyedJaggedTensor,
@@ -409,44 +409,12 @@ def __init__(
         scale_weight_gradients: bool = True,
         sharding_type: Optional[ShardingType] = None,
     ) -> None:
-        # TODO rename to _create_embedding_kernel
-        def _create_lookup(
-            config: GroupedEmbeddingConfig,
-            device: Optional[torch.device] = None,
-            sharding_type: Optional[ShardingType] = None,
-        ) -> BaseEmbedding:
-            if config.compute_kernel == EmbeddingComputeKernel.DENSE:
-                return BatchedDenseEmbeddingBag(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                    sharding_type=sharding_type,
-                )
-            elif config.compute_kernel == EmbeddingComputeKernel.FUSED:
-                return BatchedFusedEmbeddingBag(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                    sharding_type=sharding_type,
-                )
-            elif config.compute_kernel in {
-                EmbeddingComputeKernel.KEY_VALUE,
-            }:
-                return KeyValueEmbeddingBag(
-                    config=config,
-                    pg=pg,
-                    device=device,
-                    sharding_type=sharding_type,
-                )
-            else:
-                raise ValueError(
-                    f"Compute kernel not supported {config.compute_kernel}"
-                )
-
         super().__init__()
         self._emb_modules: nn.ModuleList = nn.ModuleList()
         for config in grouped_configs:
-            self._emb_modules.append(_create_lookup(config, device, sharding_type))
+            self._emb_modules.append(
+                self._create_embedding_kernel(config, device, pg, sharding_type)
+            )
 
         self._feature_splits: List[int] = []
         for config in grouped_configs:
@@ -473,6 +441,39 @@ def _create_lookup(
             else 1
         )
 
+    def _create_embedding_kernel(
+        self,
+        config: GroupedEmbeddingConfig,
+        device: Optional[torch.device],
+        pg: Optional[dist.ProcessGroup],
+        sharding_type: Optional[ShardingType],
+    ) -> BaseEmbedding:
+        if config.compute_kernel == EmbeddingComputeKernel.DENSE:
+            return BatchedDenseEmbeddingBag(
+                config=config,
+                pg=pg,
+                device=device,
+                sharding_type=sharding_type,
+            )
+        elif config.compute_kernel == EmbeddingComputeKernel.FUSED:
+            return BatchedFusedEmbeddingBag(
+                config=config,
+                pg=pg,
+                device=device,
+                sharding_type=sharding_type,
+            )
+        elif config.compute_kernel in {
+            EmbeddingComputeKernel.KEY_VALUE,
+        }:
+            return KeyValueEmbeddingBag(
+                config=config,
+                pg=pg,
+                device=device,
+                sharding_type=sharding_type,
+            )
+        else:
+            raise ValueError(f"Compute kernel not supported {config.compute_kernel}")
+
     def prefetch(
         self,
         sparse_features: KeyedJaggedTensor,
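The closures formerly named `_create_lookup` are now `_create_embedding_kernel` methods, so a child lookup class can intercept the `CUSTOMIZED_KERNEL` placeholder and return its own `BaseEmbedding`. A minimal sketch, assuming the sequence-embedding variant above belongs to `GroupedEmbeddingsLookup` and that `HKVEmbedding` is a user-provided kernel (the `dynamicemb.kernels` import path is made up):

```python
from typing import Optional

import torch
import torch.distributed as dist
from torchrec.distributed.embedding_kernel import BaseEmbedding
from torchrec.distributed.embedding_lookup import GroupedEmbeddingsLookup
from torchrec.distributed.embedding_types import (
    EmbeddingComputeKernel,
    GroupedEmbeddingConfig,
)

from dynamicemb.kernels import HKVEmbedding  # hypothetical user-provided kernel


class DynamicEmbGroupedEmbeddingsLookup(GroupedEmbeddingsLookup):
    def _create_embedding_kernel(
        self,
        config: GroupedEmbeddingConfig,
        pg: Optional[dist.ProcessGroup],
        device: Optional[torch.device],
    ) -> BaseEmbedding:
        if config.compute_kernel == EmbeddingComputeKernel.CUSTOMIZED_KERNEL:
            # config.fused_params carries anything merged in from the
            # ParameterSharding subclass (see get_additional_fused_params).
            return HKVEmbedding(config=config, pg=pg, device=device)
        # Fall back to the stock DENSE / FUSED / KEY_VALUE handling.
        return super()._create_embedding_kernel(config, pg, device)
```

A customized `EmbeddingSharding` (as in the previous sketch) would then construct this lookup instead of the default one.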

torchrec/distributed/embedding_types.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ class EmbeddingComputeKernel(Enum):
     QUANT_UVM = "quant_uvm"
     QUANT_UVM_CACHING = "quant_uvm_caching"
     KEY_VALUE = "key_value"
+    CUSTOMIZED_KERNEL = "customized_kernel"
 
 
 def compute_kernel_to_embedding_location(
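For reference, a per-table plan entry that opts into the new placeholder kernel could look like the sketch below; the table name and ranks are made up, and in practice the entry usually comes from the planner and a `ParameterSharding` subclass as described in the summary:

```python
from torchrec.distributed.embedding_types import EmbeddingComputeKernel
from torchrec.distributed.types import ParameterSharding, ShardingType

# Hypothetical per-table entry of a module sharding plan.
plan_entry = {
    "table_0": ParameterSharding(
        sharding_type=ShardingType.ROW_WISE.value,
        compute_kernel=EmbeddingComputeKernel.CUSTOMIZED_KERNEL.value,
        ranks=[0, 1],
    )
}
```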
