
Commit af0f1a6

Static fused moe op (vllm-project#41)
* Fix mixtral hidden states layout to fit into habana model runner
* Add static moe op to mixtral
* Add mark_step to static_fused_moe
* Update __init__.py
* Fix code indentation
* Make code compatible with non HPU devices
* Move static_fused_moe to vllm.hpu.ops
* Update mixtral.py
* Move op import from forward to top of the file
* Remove circular import
1 parent 3c827b3 commit af0f1a6

2 files changed: +66, -15 lines changed


vllm/hpu/ops.py

Lines changed: 36 additions & 0 deletions
@@ -113,3 +113,39 @@ def apply_rope(
 
 def awq_gemm(*args):
     raise NotImplementedError
+
+
+def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor:
+    d = x.shape[-1] // 2
+    output_shape = (x.shape[:-1] + (d, ))
+    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+    silu_and_mul(out, x)
+    return out
+
+
+@hpu_utils.with_mark_steps
+def static_fused_moe(hidden_states, w1, w2, score, topk):
+    B, D = hidden_states.shape
+    num_experts = w1.shape[0]
+    routing_weights = F.softmax(score, dim=1, dtype=torch.float32)
+    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
+    routing_weights = routing_weights.to(hidden_states.dtype)
+    final_hidden_states = torch.zeros(
+        (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device
+    )
+    padded_weights = torch.zeros(
+        (B, num_experts), dtype=hidden_states.dtype, device=hidden_states.device
+    )
+    padded_weights.scatter_(-1, selected_experts, routing_weights)
+    padded_weights = padded_weights.reshape(-1, B, w1.shape[0])
+    padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1)
+
+    for expert_idx in range(num_experts):
+        padded_weight = padded_weights[expert_idx]
+        current_state_static = hidden_states.reshape(-1, D)
+        w_output = silu_and_mul_wrapper(torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1)))
+        w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1))
+        current_hidden_states_static = w_output * padded_weight
+        final_hidden_states += current_hidden_states_static
+
+    return final_hidden_states.view(-1, D)
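
For context, a minimal plain-PyTorch sketch of the same routing math is shown below. It is illustrative only and not part of the commit: reference_moe is a hypothetical name, the sizes (B, D, I, E) are made up, and it skips the per-expert weight padding and the with_mark_steps decorator used by the op above. Up to floating-point differences it should match static_fused_moe's output shape and values.

# Illustrative reference only (not the HPU op): per-token top-k MoE in plain PyTorch.
import torch
import torch.nn.functional as F

def reference_moe(hidden_states, w1, w2, score, topk):
    # hidden_states: (B, D); w1: (E, 2*I, D); w2: (E, D, I); score: (B, E)
    B, D = hidden_states.shape
    routing_weights = F.softmax(score, dim=1, dtype=torch.float32)
    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
    routing_weights = routing_weights.to(hidden_states.dtype)

    out = torch.zeros_like(hidden_states)
    for token in range(B):
        for k in range(topk):
            e = selected_experts[token, k]
            gate_up = hidden_states[token] @ w1[e].t()      # (2*I,)
            d = gate_up.shape[-1] // 2
            act = F.silu(gate_up[:d]) * gate_up[d:]         # SiLU-and-mul
            out[token] += routing_weights[token, k] * (act @ w2[e].t())
    return out

B, D, I, E, topk = 4, 8, 16, 4, 2   # arbitrary example sizes
x = torch.randn(B, D)
w1 = torch.randn(E, 2 * I, D)
w2 = torch.randn(E, D, I)
score = torch.randn(B, E)
print(reference_moe(x, w1, w2, score, topk).shape)  # torch.Size([4, 8])

Unlike this sketch, the fused op avoids per-token Python control flow: every token is pushed through every expert and non-selected contributions are zeroed via the padded weights, which keeps shapes static across steps.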

vllm/model_executor/models/mixtral.py

Lines changed: 30 additions & 15 deletions
@@ -50,7 +50,10 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import SamplerOutput
-from vllm.utils import print_warning_once
+from vllm.utils import print_warning_once, is_hpu
+
+if is_hpu():
+    from vllm.hpu.ops import static_fused_moe
 
 
 class MixtralMoE(nn.Module):
@@ -220,28 +223,40 @@ def process_weights_after_loading(self):
                                           requires_grad=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_size = hidden_states.shape
+        if is_hpu():
+            batch_size, sequence_length, hidden_size = hidden_states.shape
+        else:
+            num_tokens, hidden_size = hidden_states.shape
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w13_weight,
-                                        self.w2_weight,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=True,
-                                        inplace=True,
-                                        use_fp8=self.use_fp8,
-                                        w1_scale=self.w13_scale,
-                                        w2_scale=self.w2_scale,
-                                        a1_scale=self.a13_scale,
-                                        a2_scale=self.a2_scale)
+
+        if is_hpu():
+            final_hidden_states = static_fused_moe(hidden_states,
+                                                   self.w13_weight,
+                                                   self.w2_weight,
+                                                   router_logits,
+                                                   self.top_k)
+        else:
+            final_hidden_states = fused_moe(hidden_states,
+                                            self.w13_weight,
+                                            self.w2_weight,
+                                            router_logits,
+                                            self.top_k,
+                                            renormalize=True,
+                                            inplace=True,
+                                            use_fp8=self.use_fp8,
+                                            w1_scale=self.w13_scale,
+                                            w2_scale=self.w2_scale,
+                                            a1_scale=self.a13_scale,
+                                            a2_scale=self.a2_scale)
 
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
 
-        return final_hidden_states.view(num_tokens, hidden_size)
+        return (final_hidden_states.view(batch_size, sequence_length, hidden_size) if is_hpu()
+                else final_hidden_states.view(num_tokens, hidden_size))
 
 
 class MixtralAttention(nn.Module):
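
Per the first commit bullet, the Habana model runner hands MixtralMoE a 3D (batch, seq_len, hidden) tensor, so the HPU branch flattens it for the gate and MoE and restores the 3D view on return. Below is a tiny shape-only sketch of that round trip; the sizes are arbitrary and it is not part of the commit.

# Shape handling only, mirroring the HPU branch above.
import torch

batch_size, sequence_length, hidden_size = 2, 3, 8
hidden_states = torch.randn(batch_size, sequence_length, hidden_size)

flat = hidden_states.view(-1, hidden_size)   # (batch * seq, hidden) fed to gate/MoE
restored = flat.view(batch_size, sequence_length, hidden_size)
assert torch.equal(hidden_states, restored)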
