@@ -57,17 +57,19 @@ def _partition_fn(self, name, module, device_mesh):
         # w1 shape = (experts, out_dim, in_dim)
         module.register_parameter(
             "w1", nn.Parameter(distribute_tensor(module.w1, device_mesh, [Shard(1)]))
-        )
+        )  # Rowwise sharding
+
         # w2 shape = (experts, in_dim, out_dim)
         module.register_parameter(
             "w2",
             nn.Parameter(distribute_tensor(module.w2, device_mesh, [Shard(2)])),
-        )
+        )  # Columnwise sharding
+
         # w3 shape = (experts, out_dim, in_dim)
         module.register_parameter(
             "w3",
             nn.Parameter(distribute_tensor(module.w3, device_mesh, [Shard(1)])),
-        )
+        )  # Rowwise sharding
 
     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
         return distribute_module(
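
Note on the new comments in this hunk: the labels map onto each expert's matrix literally — `Shard(1)` on a `(experts, out_dim, in_dim)` weight splits that expert's rows across the TP mesh, and `Shard(2)` on a `(experts, in_dim, out_dim)` weight splits its columns (so w1 and w3, which share shape and placement, get the same "Rowwise" label). Below is a minimal standalone sketch of that behavior, not code from this commit: the 2-rank mesh, shapes, and variable names are illustrative, and it assumes a recent PyTorch where `distribute_tensor` and `Shard` are exposed under `torch.distributed.tensor`.

```python
# Hypothetical sketch of the 1-D placements above; launch with e.g.
# `torchrun --nproc_per_node=2 sketch.py`. All shapes are made up.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

mesh = init_device_mesh("cpu", (2,))  # 1-D TP mesh over 2 ranks

experts, out_dim, in_dim = 4, 16, 8
w1 = torch.randn(experts, out_dim, in_dim)
w2 = torch.randn(experts, in_dim, out_dim)

# Shard(1): split dim 1 (each expert's rows) across the mesh.
w1_d = distribute_tensor(w1, mesh, [Shard(1)])
# Shard(2): split dim 2 (each expert's columns) across the mesh.
w2_d = distribute_tensor(w2, mesh, [Shard(2)])

# Every rank holds all experts but only half of the sharded matrix dim.
assert w1_d.to_local().shape == (experts, out_dim // 2, in_dim)
assert w2_d.to_local().shape == (experts, in_dim, out_dim // 2)
```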
@@ -230,17 +232,19 @@ def _partition_fn_2d(self, name, mod, ep_tp_mesh):
         mod.register_parameter(
             "w1",
             nn.Parameter(distribute_tensor(mod.w1, ep_tp_mesh, [Shard(0), Shard(1)])),
-        )
+        )  # Rowwise sharding
+
         # w2 shape = (experts, in_dim, out_dim)
         mod.register_parameter(
             "w2",
             nn.Parameter(distribute_tensor(mod.w2, ep_tp_mesh, [Shard(0), Shard(2)])),
-        )
+        )  # Columnwise sharding
+
         # w3 shape = (experts, out_dim, in_dim)
         mod.register_parameter(
             "w3",
             nn.Parameter(distribute_tensor(mod.w3, ep_tp_mesh, [Shard(0), Shard(1)])),
-        )
+        )  # Rowwise sharding
 
     def _token_combine(self, mod, routed_output, device_mesh):
         # token combine happens on the EP mesh, whereas device_mesh is [ep, tp] mesh
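
The 2-D variant stacks expert parallelism on top: the extra leading `Shard(0)` splits the expert dimension over the `ep` mesh axis, while the second placement shards each local expert's matrix over `tp` exactly as in the 1-D case. A similar hedged sketch under the same assumptions (a hypothetical 4-rank job arranged as a 2×2 `(ep, tp)` mesh; names and sizes are again illustrative):

```python
# Hypothetical sketch of the 2-D [Shard(0), Shard(*)] placements above;
# launch with e.g. `torchrun --nproc_per_node=4 sketch_2d.py`.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

ep_tp_mesh = init_device_mesh("cpu", (2, 2), mesh_dim_names=("ep", "tp"))

experts, out_dim, in_dim = 4, 16, 8
w1 = torch.randn(experts, out_dim, in_dim)
w2 = torch.randn(experts, in_dim, out_dim)

# [Shard(0), Shard(1)]: experts split over "ep", rows split over "tp".
w1_d = distribute_tensor(w1, ep_tp_mesh, [Shard(0), Shard(1)])
# [Shard(0), Shard(2)]: experts split over "ep", columns split over "tp".
w2_d = distribute_tensor(w2, ep_tp_mesh, [Shard(0), Shard(2)])

# Every rank holds experts // 2 experts and half of each expert's matrix.
assert w1_d.to_local().shape == (experts // 2, out_dim // 2, in_dim)
assert w2_d.to_local().shape == (experts // 2, in_dim, out_dim // 2)
```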