Commit 48b2a11

fix sink
1 parent bb8ee6f commit 48b2a11

2 files changed: +19 -16 lines

torchtitan/experiments/gpt_oss/infra/parallelize.py

Lines changed: 13 additions & 14 deletions
@@ -22,7 +22,6 @@
 from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
 from torchtitan.experiments.llama4.infra.parallelize import (
     apply_fsdp,
-    apply_moe_ep_tp,
 )

 from torchtitan.tools.logging import logger
@@ -231,30 +230,30 @@ def apply_non_moe_tp(
         layer_plan = {
             "attention_norm": SequenceParallel(),
             "attention": prepare_module_input(
-                input_layouts=(Shard(1), Replicate()),
-                desired_input_layouts=(Replicate(), Replicate()),
+                input_layouts=(Shard(1), None),
+                desired_input_layouts=(Replicate(), None),
             ),
-            "attention.wq": colwise_parallel(use_local_output=False),
-            "attention.wk": colwise_parallel(use_local_output=False),
-            "attention.wv": colwise_parallel(use_local_output=False),
+            "attention.wq": colwise_parallel(),
+            "attention.wk": colwise_parallel(),
+            "attention.wv": colwise_parallel(),
             "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
             "ffn_norm": SequenceParallel(),
         }

+        # shard attention.sinks across heads
+        # TODO(jianiw): Fix the sink implementation for nn.Parameter
+        attn = transformer_block.attention
+        attn.register_parameter(
+            "sinks",
+            nn.Parameter(distribute_tensor(attn.sinks, tp_mesh, [Shard(0)])),
+        )
+
         parallelize_module(
             module=transformer_block,
             device_mesh=tp_mesh,
             parallelize_plan=layer_plan,
         )

-        # shard attention.sinks across heads
-        # TODO(jianiw): Fix the sink implementation
-        # attn = transformer_block.attention
-        # attn.register_parameter(
-        #     "sinks",
-        #     nn.Parameter(distribute_tensor(attn.sinks, tp_mesh, [Replicate()])),
-        # )
-
     if enable_async_tp:
         from torch.distributed._symmetric_memory import enable_symm_mem_for_group

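The new block above replaces the previously commented-out Replicate() version: attention.sinks is re-registered as a DTensor sharded on dim 0 (the head dimension), so each tensor-parallel rank keeps only the sink logits for its local heads. Below is a minimal, hedged sketch of that distribute_tensor + register_parameter pattern in isolation; ToyAttention, shard_sinks_over_heads, and the single-process gloo harness are illustrative stand-ins rather than torchtitan code, and the torch.distributed.tensor import assumes a recent PyTorch.

# Hedged sketch: shard a per-head nn.Parameter across a TP mesh, mirroring
# the register_parameter(...) call added in parallelize.py above.
# Run under torchrun; a single-process run also works for a quick check.
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor


class ToyAttention(nn.Module):  # stand-in for the gpt_oss attention module
    def __init__(self, n_heads: int):
        super().__init__()
        self.sinks = nn.Parameter(torch.zeros(n_heads))  # one sink logit per head


def shard_sinks_over_heads(attn: nn.Module, tp_mesh) -> None:
    # Re-register `sinks` as a DTensor sharded on dim 0 (heads), so each TP
    # rank holds a [n_heads // tp_degree] local shard.
    attn.register_parameter(
        "sinks",
        nn.Parameter(distribute_tensor(attn.sinks, tp_mesh, [Shard(0)])),
    )


if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(
        "gloo",
        rank=int(os.environ.get("RANK", 0)),
        world_size=int(os.environ.get("WORLD_SIZE", 1)),
    )
    tp_mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))
    attn = ToyAttention(n_heads=8)
    shard_sinks_over_heads(attn, tp_mesh)
    print(type(attn.sinks), getattr(attn.sinks, "placements", None))
    dist.destroy_process_group()

With Shard(0) instead of the earlier Replicate(), the local sinks tensor has shape [local_heads], which is what the model.py comments below expect.
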
torchtitan/experiments/gpt_oss/model/model.py

Lines changed: 6 additions & 2 deletions
@@ -245,14 +245,18 @@ def forward(
                 k,
                 v,
                 scale=None,
-                return_lse=False,
+                return_lse=True,
             )

             # Apply attention sink rescaling: rescale by σ(lse - w[h])
             # This is mathematically equivalent to concatenating learnable sink weights
+            # TODO: self.sinks is registered as a DTensor (sharded over heads), while lse is a plain tensor
+            # q, k, v are already sharded by TP: [batch, local_heads, seq_len, head_dim] (plain tensors)
+            # sinks shape needs to match: [local_heads]
+            # [rank0]: lse.shape torch.Size([8, 32, 2048]), <class 'torch.Tensor'>
             sink_scale = torch.sigmoid(lse - self.sinks.view(1, -1, 1)).unsqueeze(
                 -1
-            )  # [B,H,S,1]
+            )
             output = output * sink_scale.to(output.dtype)

         else:

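For reference, the identity the rescaling comment relies on: appending a per-head sink logit w as one extra column of the attention scores multiplies every ordinary softmax weight by exp(lse) / (exp(lse) + exp(w)) = σ(lse - w), where lse is the log-sum-exp of the real scores, which is why the attention call now needs return_lse=True. A small self-contained numerical check of that identity, using plain softmax attention on toy shapes rather than the repo's attention kernel (all names here are illustrative):

# Hedged sketch: verify output_with_sink == output * sigmoid(lse - sink).
import torch

torch.manual_seed(0)
B, H, S, D = 2, 4, 8, 16
q = torch.randn(B, H, S, D)
k = torch.randn(B, H, S, D)
v = torch.randn(B, H, S, D)
sinks = torch.randn(H)  # one learnable sink logit per head

scores = torch.einsum("bhqd,bhkd->bhqk", q, k) * D ** -0.5  # [B, H, S, S]

# (a) Reference: concatenate the sink logit as an extra score column before the
#     softmax, then drop its probability (the sink contributes no value vector).
sink_col = sinks.view(1, H, 1, 1).expand(B, H, S, 1)
probs = torch.softmax(torch.cat([scores, sink_col], dim=-1), dim=-1)[..., :-1]
out_ref = torch.einsum("bhqk,bhkd->bhqd", probs, v)

# (b) Equivalent: ordinary softmax attention rescaled by sigmoid(lse - sink),
#     which is what the model.py change above does with the kernel's lse output.
lse = torch.logsumexp(scores, dim=-1)  # [B, H, S]
out = torch.einsum("bhqk,bhkd->bhqd", torch.softmax(scores, dim=-1), v)
out = out * torch.sigmoid(lse - sinks.view(1, H, 1)).unsqueeze(-1)  # [B, H, S, 1]

print(torch.allclose(out_ref, out, atol=1e-5))  # expected: True

Under tensor parallelism the same identity holds head by head, so the open question flagged in the TODO is about shapes and tensor types: the local lse is a plain [batch, local_heads, seq_len] tensor, while self.sinks is now a DTensor whose local shard has shape [local_heads].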