@@ -52,8 +52,8 @@ def forward(self, hidden_states):
 class OpenAIMoeExperts(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.num_experts = config.num_local_experts
         self.intermediate_size = config.intermediate_size
+        self.num_experts = config.num_local_experts
         self.hidden_size = config.hidden_size
         self.expert_dim = self.intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
@@ -70,16 +70,19 @@ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None):
         For inference we can sacrifice some memory and compute the output for all experts at once by repeating the inputs.
 
         Args:
-            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
+            hidden_states (torch.Tensor): (batch_size, seq_len, hidden_size)
             selected_experts (torch.Tensor): (batch_size * token_num, top_k)
             routing_weights (torch.Tensor): (batch_size * token_num, top_k)
         Returns:
             torch.Tensor
         """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        num_experts = routing_weights.shape[0]
         if self.training:
             next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
             with torch.no_grad():
-                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts).permute(
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts).permute(
                     2, 1, 0
                 )
                 expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
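A quick aside on the mask construction above (my toy numbers, not from the PR): one_hot over the top-k indices, permuted to (num_experts, top_k, num_tokens), gives each expert a view of which tokens picked it, and the sum/nonzero pass keeps only the experts that actually received tokens.

    import torch

    # Toy check of the mask logic: 4 tokens, top_k = 2, 3 experts.
    router_indices = torch.tensor([[0, 2], [1, 2], [0, 1], [2, 1]])  # (num_tokens, top_k)
    expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=3).permute(2, 1, 0)
    # expert_mask[e, k, t] == 1 iff token t picked expert e in top-k slot k
    expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
    print(expert_hitted.squeeze(-1))  # tensor([0, 1, 2]): every expert was hit at least once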
@@ -100,42 +103,62 @@ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None):
                 )  # (num_tokens, hidden_dim)
                 weighted_output = out * routing_weights[top_x, idx, None]  # (num_tokens, hidden_dim)
                 next_states.index_add_(0, top_x, weighted_output.to(hidden_states.dtype)[0])
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
         else:
-            hidden_states = hidden_states.repeat(self.num_experts, 1)
-            hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
+            hidden_states = hidden_states.repeat(num_experts, 1)
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
             gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
             gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
             glu = gate * torch.sigmoid(gate * self.alpha)
-            next_states = torch.bmm(((up + 1) * glu), self.down_proj) + self.down_proj_bias[..., None, :]
-            next_states = next_states.view(-1, self.hidden_size)
-        return next_states
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)  # (num_experts, batch_size, seq_len, hidden_size)
+        return next_states, None
 
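The batched inference path is easier to follow with the shapes written out. A minimal sketch with assumed toy sizes and random weights (alpha here is a placeholder, not the model's configured value):

    import torch

    E, T, H, D, alpha = 3, 5, 8, 16, 1.702  # experts, tokens, hidden, expert_dim (assumed)
    x = torch.randn(T, H).repeat(E, 1).view(E, T, H)       # every expert sees all tokens
    gate_up = torch.bmm(x, torch.randn(E, H, 2 * D))       # (E, T, 2*D), one matmul per expert
    gate, up = gate_up.chunk(2, dim=-1)                    # (E, T, D) each
    glu = gate * torch.sigmoid(gate * alpha)               # the gated activation from the diff
    out = torch.bmm((up + 1) * glu, torch.randn(E, D, H))  # (E, T, H): per-expert outputs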
+class TopKRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_local_experts
+        self.hidden_dim = config.hidden_size
+        self.weight = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim))
+        self.bias = nn.Parameter(torch.empty(self.num_experts))
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)  # (seq_len, num_experts)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)  # (seq_len, top_k)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value).transpose(0, 1)  # (num_experts, seq_len)
+        return router_scores, router_indices
+
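One detail worth spelling out: the softmax runs over the top-k logits only, and scatter_ writes those probabilities back into a dense matrix, so non-selected experts get exactly zero weight. A toy run (values approximate):

    import torch

    logits = torch.tensor([[2.0, 0.5, 1.0]])           # 1 token, 3 experts
    top_v, top_i = torch.topk(logits, k=2, dim=-1)     # values [2.0, 1.0], experts [0, 2]
    top_v = torch.nn.functional.softmax(top_v, dim=1)  # ~[0.73, 0.27]
    scores = torch.zeros_like(logits).scatter_(1, top_i, top_v).transpose(0, 1)
    print(scores)  # (num_experts, seq_len): [[0.73], [0.00], [0.27]]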
+class TokenDispatcher(nn.Module):
+    # this module matters for expert parallelism: the EP hook attaches here
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+
+    def forward(self, routed_out, routing_weights):
+        # routed_out is (num_experts, batch_size, seq_len, hidden_size)
+        routed_out = routed_out * routing_weights[:, None, :, None]  # zero weights discard the routed_out computed for the non-selected experts
+        routed_out = routed_out.sum(dim=0)  # (batch_size, seq_len, hidden_size)
+        return routed_out
 
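The dispatcher's broadcast-multiply-and-sum is a weighted combine over the expert axis. A small equivalence check against an einsum spelling (toy sizes, my own sketch, not part of the PR):

    import torch

    E, B, S, H = 3, 2, 5, 8
    routed_out = torch.randn(E, B, S, H)
    weights = torch.rand(E, S)  # (num_experts, seq_len)
    combined = (routed_out * weights[:, None, :, None]).sum(dim=0)  # (B, S, H)
    assert torch.allclose(combined, torch.einsum("ebsh,es->bsh", routed_out, weights), atol=1e-6)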
 @use_kernel_forward_from_hub("MegaBlocksMoeMLP")
 class OpenAIMoeMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.top_k = config.num_experts_per_tok
-        self.hidden_dim = config.hidden_size
-        self.num_local_experts = config.num_local_experts
+        self.router = TopKRouter(config)
         self.experts = OpenAIMoeExperts(config)
-        self.router = nn.Linear(config.hidden_size, config.num_local_experts, bias=True)
+        self.token_dispatcher = TokenDispatcher(config)
 
     def forward(self, hidden_states):
         # we don't slice the weight as it's not compile compatible
-        batch_size = hidden_states.shape[0]
-        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
-        router_logits = self.router(hidden_states)
-        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
-        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1)
-        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value).transpose(0, 1)
-        routed_out = self.experts(hidden_states, router_indices, router_top_value)
-        if self.training:
-            output_states = routed_out.view(batch_size, -1, self.hidden_dim)
-        else:
-            routed_out = routed_out.view(self.num_local_experts, -1, self.hidden_dim) * router_scores[..., None]
-            output_states = routed_out.view(self.num_local_experts, batch_size, -1, self.hidden_dim).sum(dim=0)
-        return output_states, router_scores
+        router_scores, router_indices = self.router(hidden_states)  # (num_experts, seq_len)
+        routed_out, _ = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)  # TODO: router_indices isn't used inside this function
+        hidden_states = self.token_dispatcher(routed_out, router_scores)
+        return hidden_states, router_scores
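To see how the three pieces compose, here is a hedged, self-contained re-sketch of the router -> experts -> dispatcher flow, with assumed sizes and a relu stand-in for the GLU (it deliberately does not instantiate the classes above, whose full config is not shown in this hunk):

    import torch

    E, K, B, S, H, D = 4, 2, 2, 5, 8, 16  # experts, top_k, batch, seq, hidden, expert_dim
    x = torch.randn(B, S, H)

    # Router: softmax over top-k logits, scattered into dense (E, B*S) scores.
    logits = torch.randn(B * S, E)
    top_v, top_i = torch.topk(logits, K, dim=-1)
    scores = torch.zeros_like(logits).scatter_(1, top_i, top_v.softmax(dim=1)).transpose(0, 1)

    # Experts (inference path): every token through every expert via bmm.
    flat = x.reshape(-1, H).repeat(E, 1).view(E, -1, H)
    h = torch.relu(torch.bmm(flat, torch.randn(E, H, D)))  # relu stands in for the GLU
    routed_out = torch.bmm(h, torch.randn(E, D, H)).view(E, B, S, H)

    # Dispatcher: weight each expert's output and reduce over the expert axis.
    out = (routed_out * scores.view(E, B, S)[..., None]).sum(dim=0)
    print(out.shape)  # torch.Size([2, 5, 8])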
 
 
 class OpenAIMoeRotaryEmbedding(LlamaRotaryEmbedding):