
Commit 5458b03

Jingchang Zhang authored and meta-codesync[bot] committed
Add option to use gather to select embeddings in EC (#3479)
Summary: Pull Request resolved: #3479

torch.index_select relies on atomic adds in its backward pass, so its backward performance can be noticeably worse than torch.gather. This diff gives users control over the indexing step so the more suitable operator can be selected for a given workload.

Perf comparison on pure operators (forward + backward); torch.gather was the faster (or tied) operator in every case:

| Scenario | Config | torch.gather (s) | torch.index_select (s) | index_select relative speed |
| --- | --- | --- | --- | --- |
| 2D Embedding, No Repetition | shape=(1000000, 256), dim=0, indices=100000, unique=95300 (95.3%) | 0.9439 | 1.0509 | 0.90x |
| 2D Embedding, Low Repetition | shape=(1000000, 256), dim=0, indices=100000, unique=48732 (48.7%) | 0.9076 | 1.0415 | 0.87x |
| 2D Embedding, High Repetition | shape=(1000000, 256), dim=0, indices=250000, unique=9957 (4.0%) | 1.2385 | 1.6225 | 0.76x |
| Small Vocab, Low Repetition | shape=(1000, 256), dim=0, indices=2000, unique=635 (31.8%) | 0.1502 | 0.1763 | 0.85x |
| Small Vocab, Very High Repetition | shape=(1000, 256), dim=0, indices=100000, unique=625 (0.6%) | 0.2626 | 0.4126 | 0.64x |
| Large Vocab, No Repetition | shape=(10000000, 256), dim=0, indices=10000, unique=9996 (100.0%) | 5.8014 | 5.8184 | 1.00x |
| Large Vocab, Low Repetition | shape=(10000000, 256), dim=0, indices=10000, unique=5000 (50.0%) | 5.7912 | 5.8137 | 1.00x |
| Large Vocab, High Repetition | shape=(10000000, 256), dim=0, indices=10000, unique=400 (4.0%) | 5.7784 | 5.8100 | 0.99x |

Mast Job Test:
- baseline: fire-jingchang-f816557933, torch.index_select backward takes ~37ms {F1982939713}
- exp: fire-jingchang-f816355728, torch.gather backward takes ~10ms {F1982939742}

Reviewed By: TroyGarden

Differential Revision: D85309309
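For context, a minimal timing sketch of the two selection paths (forward + backward) might look like the following. This is an illustrative standalone script, not the benchmark harness behind the numbers above; it assumes a CUDA device and mirrors the shapes of the "2D Embedding, High Repetition" row.

```python
import time

import torch


def bench(make_output, iters: int = 20) -> float:
    """Time forward + backward of a selection op, synchronizing CUDA around the loop."""
    for _ in range(3):  # warmup
        make_output().sum().backward()
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        make_output().sum().backward()
    torch.cuda.synchronize()
    return time.perf_counter() - start


# High-repetition case: 250k lookups into ~10k distinct rows of a 1M x 256 table.
emb = torch.randn(1_000_000, 256, device="cuda", requires_grad=True)
idx = torch.randint(0, 10_000, (250_000,), device="cuda")

# index_select takes the 1-D index directly.
t_index_select = bench(lambda: torch.index_select(emb, 0, idx))

# gather needs the index expanded to the embedding width.
expanded = idx.unsqueeze(1).expand(-1, emb.size(-1))
t_gather = bench(lambda: torch.gather(emb, 0, expanded))

print(f"index_select: {t_index_select:.4f}s   gather: {t_gather:.4f}s")
```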
1 parent 7ddc21d commit 5458b03

File tree

3 files changed: +26 −3 lines changed


torchrec/distributed/embedding.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -338,6 +338,7 @@ def __init__(
         features_to_permute_indices: Optional[Dict[str, List[int]]] = None,
         module_fqn: Optional[str] = None,
         sharding_types: Optional[List[str]] = None,
+        use_gather_select: bool = False,
     ) -> None:
         super().__init__()
         self._awaitables_per_sharding = awaitables_per_sharding
@@ -348,6 +349,7 @@ def __init__(
         self._ctx = ctx
         self._module_fqn = module_fqn
         self._sharding_types = sharding_types
+        self._use_gather_select = use_gather_select
 
     def _wait_impl(self) -> Dict[str, JaggedTensor]:
         jt_dict: Dict[str, JaggedTensor] = {}
@@ -389,6 +391,7 @@ def _wait_impl(self) -> Dict[str, JaggedTensor]:
                     original_features=original_features,
                     reverse_indices=reverse_indices,
                     seq_vbe_ctx=seq_vbe_ctx,
+                    use_gather_select=self._use_gather_select,
                 )
             )
         return jt_dict
@@ -529,6 +532,7 @@ def __init__(
             module.embedding_configs(), table_name_to_parameter_sharding
         )
         self._need_indices: bool = module.need_indices()
+        self._use_gather_select: bool = module.use_gather_select()
         self._inverse_indices_permute_per_sharding: Optional[List[torch.Tensor]] = None
         self._skip_missing_weight_key: List[str] = []
 
@@ -1563,6 +1567,7 @@ def output_dist(
             need_indices=self._need_indices,
             features_to_permute_indices=self._features_to_permute_indices,
             ctx=ctx,
+            use_gather_select=self._use_gather_select,
         )
 
     def compute_and_output_dist(
@@ -1612,6 +1617,7 @@ def compute_and_output_dist(
             ctx=ctx,
             module_fqn=self._module_fqn,
             sharding_types=list(self._sharding_type_to_sharding.keys()),
+            use_gather_select=self._use_gather_select,
         )
 
     def _embedding_dim_for_sharding_type(self, sharding_type: str) -> int:
```

torchrec/modules/embedding_modules.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -408,13 +408,15 @@ def __init__(  # noqa C901
         tables: List[EmbeddingConfig],
         device: Optional[torch.device] = None,
         need_indices: bool = False,
+        use_gather_select: bool = False,
     ) -> None:
         super().__init__()
         torch._C._log_api_usage_once(f"torchrec.modules.{self.__class__.__name__}")
         self.embeddings: nn.ModuleDict = nn.ModuleDict()
         self._embedding_configs = tables
         self._embedding_dim: int = -1
         self._need_indices: bool = need_indices
+        self._use_gather_select: bool = use_gather_select
         self._device: torch.device = (
             device if device is not None else torch.device("cpu")
         )
@@ -541,3 +543,10 @@ def reset_parameters(self) -> None:
             param = self.embeddings[f"{table_config.name}"].weight
             # pyre-ignore
             table_config.init_fn(param)
+
+    def use_gather_select(self) -> bool:
+        """
+        Returns:
+            bool: Whether the EmbeddingCollection uses torch.gather to select embeddings.
+        """
+        return self._use_gather_select
```
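With this diff applied, the flag is set at EmbeddingCollection construction time and read by the sharded wrapper through use_gather_select() (see torchrec/distributed/embedding.py above). A minimal construction sketch; the table and feature names here are made up for illustration:

```python
import torch

from torchrec.modules.embedding_configs import EmbeddingConfig
from torchrec.modules.embedding_modules import EmbeddingCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# One small illustrative table; names and sizes are arbitrary.
ec = EmbeddingCollection(
    tables=[
        EmbeddingConfig(
            name="t1",
            embedding_dim=8,
            num_embeddings=100,
            feature_names=["f1"],
        )
    ],
    device=torch.device("cpu"),
    use_gather_select=True,  # opt in to the gather-based selection path
)

# Two samples with two ids each for feature "f1".
features = KeyedJaggedTensor.from_lengths_sync(
    keys=["f1"],
    values=torch.tensor([1, 2, 3, 4]),
    lengths=torch.tensor([2, 2]),
)

out = ec(features)                # Dict[str, JaggedTensor]
print(out["f1"].values().shape)   # torch.Size([4, 8])
print(ec.use_gather_select())     # True
```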

torchrec/modules/utils.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -245,14 +245,22 @@ def construct_jagged_tensors(
     original_features: Optional[KeyedJaggedTensor] = None,
     reverse_indices: Optional[torch.Tensor] = None,
     seq_vbe_ctx: Optional[SequenceVBEContext] = None,
+    use_gather_select: bool = False,
 ) -> Dict[str, JaggedTensor]:
     with record_function("## construct_jagged_tensors ##"):
         if original_features is not None:
             features = original_features
         if reverse_indices is not None:
-            embeddings = torch.index_select(
-                embeddings, 0, reverse_indices.to(torch.int32)
-            )
+            if use_gather_select:
+                # gather has better backward performance than index_select in many cases
+                expanded_indices = reverse_indices.unsqueeze(1).expand(
+                    -1, embeddings.size(-1)
+                )
+                embeddings = torch.gather(embeddings, 0, expanded_indices)
+            else:
+                embeddings = torch.index_select(
+                    embeddings, 0, reverse_indices.to(torch.int32)
+                )
         ret: Dict[str, JaggedTensor] = {}
 
         if seq_vbe_ctx is not None:
```
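Both branches produce the same forward result; only the backward kernels differ. A small standalone sanity-check sketch (not part of the PR's tests), showing how expanding the index makes torch.gather equivalent to torch.index_select here:

```python
import torch

embeddings = torch.randn(6, 4, requires_grad=True)
reverse_indices = torch.tensor([0, 2, 2, 5, 1])

# index_select path (existing behavior): 1-D row indices.
out_index_select = torch.index_select(
    embeddings, 0, reverse_indices.to(torch.int32)
)

# gather path: indices must be expanded to the embedding width.
expanded = reverse_indices.unsqueeze(1).expand(-1, embeddings.size(-1))
out_gather = torch.gather(embeddings, 0, expanded)

torch.testing.assert_close(out_index_select, out_gather)  # same forward result
```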
