Commit da672b2

refactor FlexAttention
1 parent 40bd901 commit da672b2

6 files changed: +136 -85 lines changed


torchtitan/experiments/gpt_oss/infra/expert_parallel.py

Lines changed: 1 addition & 11 deletions
@@ -4,19 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Callable
 
-import torch
 import torch.nn as nn
-from torch.distributed.tensor import (
-    DeviceMesh,
-    distribute_module,
-    distribute_tensor,
-    DTensor,
-    Replicate,
-    Shard,
-)
-from torch.distributed.tensor.parallel import ParallelStyle
+from torch.distributed.tensor import distribute_tensor, Replicate, Shard
 from torchtitan.distributed.expert_parallel import ExpertParallel, TensorParallel
 
 # implementation of Tensor Parallel for the GroupedExperts in MoE

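The only change in this file is the trimmed import list: the GPT-OSS tensor-parallel style keeps just distribute_tensor plus the Replicate and Shard placements. As a minimal sketch of what those two primitives do, assuming a 1-D expert-parallel DeviceMesh created elsewhere (the helper names below are illustrative, not part of this commit):

import torch
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import distribute_tensor, Replicate, Shard

def shard_on_experts(weight: torch.Tensor, ep_mesh: DeviceMesh):
    # Split dim 0 (e.g. the num_experts dim of a grouped-experts weight)
    # across the ranks of the mesh; each rank keeps only its local shard.
    return distribute_tensor(weight, ep_mesh, [Shard(0)])

def replicate_param(param: torch.Tensor, ep_mesh: DeviceMesh):
    # Keep a full copy of the parameter on every rank of the mesh.
    return distribute_tensor(param, ep_mesh, [Replicate()])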
torchtitan/experiments/gpt_oss/infra/parallelize.py

Lines changed: 4 additions & 4 deletions
@@ -25,11 +25,11 @@
     ExpertParallel,
     ReordererSequenceParallel,
 )
-from torchtitan.models.llama4.infra.parallelize import apply_fsdp
 from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp
+from torchtitan.models.llama4.infra.parallelize import apply_fsdp
 from torchtitan.tools.logging import logger
 
-from .expert_parallel import GptossExpertTensorParallel, GptossTensorParallel
+from .expert_parallel import GptossExpertTensorParallel
 
 
 # for selective op activation checkpointing
@@ -308,10 +308,10 @@ def apply_moe_ep_tp(
     elif tp_mesh is None:
         experts_mesh = ep_mesh
         # input / output sharding on the batch / tokens dim
-        experts_plan = ExpertParallel()
+        experts_plan = GptossExpertParallel()
     elif etp_enabled:
         experts_mesh = ep_tp_mesh
-        experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh)
+        experts_plan = GptossExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh)
     else:
         experts_mesh = ep_mesh
         experts_plan = ExpertParallel()

torchtitan/experiments/gpt_oss/model/model.py

Lines changed: 68 additions & 53 deletions
@@ -7,19 +7,19 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from torchtitan.protocols.model import AttentionMasksType
 import torch
 from torch import nn
+from torch.nn.attention.flex_attention import and_masks, BlockMask
 from torchtitan.components.tokenizer import BaseTokenizer
-from torchtitan.protocols.train_spec import ModelProtocol
 from torchtitan.models.attention import (
     create_attention_mask,
     FlexAttentionWrapper,
     get_causal_mask_mod,
     get_document_mask_mod,
-    ScaledDotProductAttentionWrapper,
+    get_sliding_window_mask_mod,
 )
-from torch.nn.attention.flex_attention import and_masks, BlockMask
+from torchtitan.protocols.model import AttentionMasksType
+from torchtitan.protocols.train_spec import ModelProtocol
 
 from .args import GptOssModelArgs
 from .moe import GptOssMoE
@@ -115,14 +115,8 @@ class Attention(nn.Module):
     Multi-head attention (MLA) module.
     """
 
-    def __init__(
-        self, model_args: GptOssModelArgs, use_sliding_attention: bool = False
-    ):
+    def __init__(self, model_args: GptOssModelArgs):
         super().__init__()
-
-        self.sliding_window_size = (
-            model_args.sliding_window_size if use_sliding_attention else None
-        )
         self.head_dim = model_args.head_dim
         self.n_heads = model_args.n_heads
         self.n_kv_heads = model_args.n_kv_heads
@@ -157,7 +151,7 @@ def __init__(
             self.inner_attention = FlexAttentionWrapper()
         else:
             raise ValueError("Gpt-oss model only supports FlexAttention!")
-
+
     def init_weights(self, init_std: float):
         linear_list = [
             self.wq,
@@ -172,7 +166,6 @@ def init_weights(self, init_std: float):
         nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)
         nn.init.trunc_normal_(self.wo.bias, mean=0.0, std=init_std)
 
-
     def forward(
         self,
         x: torch.Tensor,
@@ -208,22 +201,15 @@ def forward(
 
         if self.use_flex_attn:
             assert isinstance(attention_masks, BlockMask), attention_masks
-            output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
-
-            # # FlexAttention
-            # output, aux_output = self.attn(
-            #     q,
-            #     k,
-            #     v,
-            #     scale=None,
-            #     return_lse=True,
-            # )
-
-            # Apply attention sink rescaling: rescale by σ(lse - w[h])
-            # This is mathematically equivalent to concatenating learnable sink weights
-            lse = aux_output.lse
-            sink_scale = torch.sigmoid(lse - self.sinks.view(1, -1, 1)).unsqueeze(-1)
-            output = output * sink_scale.to(output.dtype)
+            output, aux_output = self.inner_attention(
+                xq, xk, xv, block_mask=attention_masks, scale=None, return_aux=True
+            )
+
+            # Apply attention sink rescaling: rescale by σ(lse - w[h])
+            # This is mathematically equivalent to concatenating learnable sink weights
+            lse = aux_output.lse
+            sink_scale = torch.sigmoid(lse - self.sinks.view(1, -1, 1)).unsqueeze(-1)
+            output = output * sink_scale.to(output.dtype)
 
         output = output.transpose(1, 2).contiguous() # (B, H, T, D) -> (B, T, H, D)
 
@@ -234,18 +220,6 @@ def forward(
         output = self.wo(output) # (bsz, seqlen, dim)
         return output
 
-    # TODO: statically init the mask using train.seq_len
-    def sliding_window_causal(self, seqlen, device):
-        i = torch.arange(seqlen, device=device)
-        q_idx = i[:, None]
-        kv_idx = i[None, :]
-
-        causal_mask = q_idx >= kv_idx
-        if self.sliding_window is None:
-            return causal_mask
-        window_mask = q_idx - kv_idx <= self.sliding_window
-        return causal_mask & window_mask
-
 
 class TransformerBlock(nn.Module):
     """
@@ -255,10 +229,8 @@ class TransformerBlock(nn.Module):
     def __init__(self, layer_id: int, model_args: GptOssModelArgs):
 
         super().__init__()
-        use_sliding_attention = layer_id % 2 == 0
-        self.attention = Attention(
-            model_args, use_sliding_attention=use_sliding_attention
-        )
+        self.use_sliding_attention = layer_id % 2 == 0
+        self.attention = Attention(model_args)
         self.attention_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
         self.ffn_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
 
@@ -270,18 +242,31 @@ def __init__(self, layer_id: int, model_args: GptOssModelArgs):
         self.weight_init_std = 0.02 / (2 * (layer_id + 1)) ** 0.5
         self.layer_id = layer_id
 
-    def forward(self, x: torch.Tensor, rope_cache: torch.Tensor, attention_masks: AttentionMasksType | None):
+    def forward(
+        self,
+        x: torch.Tensor,
+        rope_cache: torch.Tensor,
+        attention_masks: AttentionMasksType | None,
+    ):
         """
         Forward pass for the Transformer block.
 
         Args:
             x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim).
             rope_cache (torch.Tensor): Precomputed cosine and sine frequencies.
+            attention_masks (AttentionMasksType | None): Either a single BlockMask or a dict of BlockMasks keyed by layer.
 
         Returns:
            torch.Tensor: Output tensor with the same shape as the input.
         """
-        x = x + self.attention(self.attention_norm(x), rope_cache, attention_masks)
+        # Extract the appropriate mask for this layer
+        if self.use_sliding_attention:
+            layer_mask = attention_masks.get("sliding_window_mask", None)
+        else:
+            layer_mask = attention_masks.get("basic_mask", None)
+        assert layer_mask is not None
+
+        x = x + self.attention(self.attention_norm(x), rope_cache, layer_mask)
         x = x + self.moe(self.ffn_norm(x))
         return x
 
@@ -357,24 +342,54 @@ def get_attention_masks(
         tokenizer: BaseTokenizer,
         extra_inputs: dict[str, torch.Tensor] | None = None,
     ) -> AttentionMasksType:
-        # TODO: implement this function
-        mask_mods = [get_causal_mask_mod()]
+
+        basic_mask_mods = []
+        sliding_window_mask_mods = [
+            get_sliding_window_mask_mod(self.model_args.sliding_window_size)
+        ]
         match self.model_args.attn_mask_type:
             case "causal":
                 B = 1
+                basic_mask_mods.append(get_causal_mask_mod())
+                sliding_window_mask_mods.append(get_causal_mask_mod())
             case "block_causal":
                 B = input_batch.shape[0]
-                mask_mods.append(get_document_mask_mod(input_batch, tokenizer.eos_id))
+                basic_mask_mods.append(
+                    get_document_mask_mod(input_batch, tokenizer.eos_id)
+                )
+                sliding_window_mask_mods.append(
+                    get_document_mask_mod(input_batch, tokenizer.eos_id)
+                )
             case _:
                 raise ValueError(
                     f"Unknown attention mask type: {self.model_args.attn_mask_type}"
                )
-        return create_attention_mask(
-            and_masks(*mask_mods), B, None, input_batch.shape[1], input_batch.shape[1]
+
+        # create basic attention mask: causal or block_causal
+        basic_mask = create_attention_mask(
+            and_masks(*basic_mask_mods),
+            B,
+            None,
+            input_batch.shape[1],
+            input_batch.shape[1],
+        )
+
+        # create sliding window mask, has to
+        sliding_window_mask = create_attention_mask(
+            and_masks(*sliding_window_mask_mods),
+            B,
+            None,
+            input_batch.shape[1],
+            input_batch.shape[1],
         )
 
+        return {"basic_mask": basic_mask, "sliding_window_mask": sliding_window_mask}
 
-    def forward(self, tokens: torch.Tensor, attention_masks: AttentionMasksType | None = None,):
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        attention_masks: AttentionMasksType | None = None,
+    ):
         """
         Forward pass for the Transformer model.
 
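get_attention_masks now returns two BlockMasks, a basic causal/block-causal mask and a sliding-window variant, and TransformerBlock picks one per layer (even layers use the sliding window). A minimal sketch of the same composition written against the public FlexAttention API; it assumes torchtitan's create_attention_mask and get_sliding_window_mask_mod are thin wrappers over create_block_mask and mask_mods equivalent to the removed sliding_window_causal helper:

import torch
from torch.nn.attention.flex_attention import and_masks, create_block_mask

def causal_mask_mod(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

def sliding_window_mask_mod(window: int):
    def mask_mod(b, h, q_idx, kv_idx):
        return q_idx - kv_idx <= window
    return mask_mod

seq_len, window = 128, 32
basic_mask = create_block_mask(
    causal_mask_mod, B=1, H=None, Q_LEN=seq_len, KV_LEN=seq_len, device="cpu"
)
sliding_window_mask = create_block_mask(
    and_masks(sliding_window_mask_mod(window), causal_mask_mod),
    B=1, H=None, Q_LEN=seq_len, KV_LEN=seq_len, device="cpu",
)
masks = {"basic_mask": basic_mask, "sliding_window_mask": sliding_window_mask}

# Per-layer selection, mirroring TransformerBlock.use_sliding_attention:
for layer_id in range(4):
    key = "sliding_window_mask" if layer_id % 2 == 0 else "basic_mask"
    layer_mask = masks[key]

The refactored forward also keeps the attention-sink rescaling but now reads the log-sum-exp from the wrapper's aux output instead of a dead code path. The rescaling is exact: appending a sink logit w whose value vector is zero multiplies each row of ordinary attention by Z / (Z + exp(w)) = sigmoid(lse - w), where Z is the softmax normalizer and lse = log Z. A small single-head numerical check of that equivalence (shapes and values are made up):

import torch

torch.manual_seed(0)
T, D = 5, 8
scores = torch.randn(T, T)
causal = torch.arange(T)[:, None] >= torch.arange(T)[None, :]
scores = scores.masked_fill(~causal, float("-inf"))
v = torch.randn(T, D)
sink = torch.tensor(0.3)  # per-head learnable sink logit

# Reference: concatenate the sink as an extra column whose value vector is zero.
ref = torch.softmax(torch.cat([scores, sink * torch.ones(T, 1)], dim=-1), dim=-1)[:, :T] @ v

# Refactored path: ordinary attention, then rescale each row by sigmoid(lse - sink).
out = torch.softmax(scores, dim=-1) @ v
lse = torch.logsumexp(scores, dim=-1, keepdim=True)
out = out * torch.sigmoid(lse - sink)

print(torch.allclose(out, ref, atol=1e-6))  # True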
torchtitan/experiments/gpt_oss/model/moe.py

Lines changed: 26 additions & 15 deletions
@@ -8,6 +8,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from typing import Callable
+
 import torch
 from torch import nn
 from torch.distributed.tensor import DTensor
@@ -34,7 +35,7 @@ def wrapper(
         x: torch.Tensor,
         num_tokens_per_expert: torch.Tensor,
     ) -> torch.Tensor:
-        num_local_experts = w1.shape[0]
+        num_local_experts = mlp1_weight.shape[0]
         ep_degree = num_tokens_per_expert.shape[0] // num_local_experts
 
         input_shape, x, permuted_indices, num_tokens_per_expert = _permute(
@@ -57,6 +58,7 @@ def wrapper(
 
     return wrapper
 
+
 def swiglu(x, alpha: float = 1.702, limit: float = 7.0):
     x_glu, x_linear = x[..., ::2], x[..., 1::2]
     # Clamp the input values
@@ -66,6 +68,7 @@ def swiglu(x, alpha: float = 1.702, limit: float = 7.0):
     # Note we add an extra bias of 1 to the linear layer
     return out_glu * (x_linear + 1)
 
+
 def _run_experts_for_loop(
     mlp1_weight: torch.Tensor,
     mlp1_bias: torch.Tensor,
@@ -91,10 +94,7 @@ def _run_experts_for_loop(
         )
         out_experts_splits = []
         for expert_idx, x_expert in enumerate(x):
-            h = (
-                torch.matmul(x_expert, mlp1_weight[expert_idx])
-                + mlp1_bias[expert_idx]
-            )
+            h = torch.matmul(x_expert, mlp1_weight[expert_idx]) + mlp1_bias[expert_idx]
             h = swiglu(h, limit=swiglu_limit)
             h = torch.matmul(h, mlp2_weight[expert_idx]) + mlp2_bias[expert_idx]
             out_experts_splits.append(h)
@@ -110,6 +110,7 @@ def _run_experts_for_loop(
 
     return out
 
+
 def _run_experts_grouped_mm(
     mlp1_weight: torch.Tensor,
     mlp1_bias: torch.Tensor,
@@ -129,14 +130,6 @@ def _run_experts_grouped_mm(
         # fall back to regular bmm between 3D tensors
         assert x.dim() == 3
 
-    if isinstance(mlp1_weight, DTensor):
-        mlp1_weight, mlp1_bias, mlp2_weight, mlp2_bias = (
-            mlp1_weight.to_local(),
-            mlp1_bias.to_local(),
-            mlp2_weight.to_local(),
-            mlp2_bias.to_local(),
-        )
-
     h = torch._grouped_mm(x.bfloat16(), mlp1_weight.bfloat16(), offs=offsets)
     if offsets is not None:
         b1 = mlp1_bias.repeat_interleave(num_tokens_per_expert_long, dim=0)
@@ -156,6 +149,7 @@ def _run_experts_grouped_mm(
 
     return h
 
+
 class GptOssGroupedExperts(nn.Module):
     def __init__(
         self,
@@ -201,16 +195,33 @@ def forward(
                 run_experts_fn = indices_padding_wrapper(_run_experts_grouped_mm)
             else:
                 run_experts_fn = _run_experts_grouped_mm
-            return run_experts_fn(mlp1_weight, mlp1_bias, mlp2_weight, mlp2_bias, self.swiglu_limit, x, num_tokens_per_expert)
+            return run_experts_fn(
+                mlp1_weight,
+                mlp1_bias,
+                mlp2_weight,
+                mlp2_bias,
+                self.swiglu_limit,
+                x,
+                num_tokens_per_expert,
+            )
         else:
-            return _run_experts_for_loop(mlp1_weight, mlp1_bias, mlp2_weight, mlp2_bias, self.swiglu_limit, x, num_tokens_per_expert)
+            return _run_experts_for_loop(
+                mlp1_weight,
+                mlp1_bias,
+                mlp2_weight,
+                mlp2_bias,
+                self.swiglu_limit,
+                x,
+                num_tokens_per_expert,
+            )
 
     def init_weights(self, init_std: float):
         nn.init.trunc_normal_(self.mlp1_weight, mean=0.0, std=init_std)
         nn.init.trunc_normal_(self.mlp1_bias, mean=0.0, std=init_std)
         nn.init.trunc_normal_(self.mlp2_weight, mean=0.0, std=init_std)
         nn.init.trunc_normal_(self.mlp2_bias, mean=0.0, std=init_std)
 
+
 class GptOssMoE(MoE):
     """GptOss MoE implementation that inherits from the base MoE class."""
 
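For reference, only the tail of the clamped SwiGLU that GptOssGroupedExperts applies between the two grouped matmuls is visible in this diff. A standalone sketch of the full activation; the exact clamp ranges are an assumption, since those lines sit outside the hunk:

import torch

def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
    # Gate and linear halves are interleaved along the last dimension.
    x_glu, x_linear = x[..., ::2], x[..., 1::2]
    # Clamp the input values (assumed: gate clamped from above, linear to [-limit, limit]).
    x_glu = x_glu.clamp(max=limit)
    x_linear = x_linear.clamp(min=-limit, max=limit)
    # Sigmoid-gated activation; note the extra bias of 1 on the linear half.
    out_glu = x_glu * torch.sigmoid(alpha * x_glu)
    return out_glu * (x_linear + 1)

h = torch.randn(4, 16)   # hypothetical mlp1 output for 4 tokens
print(swiglu(h).shape)   # torch.Size([4, 8]), the interleaved split halves the width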