Commit d8c8cd7

Use vanilla fp8 math

1 parent 2ecfe92 commit d8c8cd7

1 file changed: 124 additions and 51 deletions

tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py
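This commit drops the fused jax_fused_moe_func_padded path in apply() and computes the MoE layer with plain ("vanilla") dot products on the fp8 expert weights, multiplying each matmul result by the per-output-channel weight scale instead of dequantizing the weights up front. Below is a standalone sketch (not part of the diff, toy shapes only) of the identity that makes this equivalent: for a per-output-channel scale, scaling the output of x @ w.T equals dequantizing w first.

# Standalone sketch with hypothetical toy shapes; not the diff's code.
import jax.numpy as jnp

t, i, o = 4, 8, 16                          # tokens, hidden dim, output dim
x = jnp.ones((t, i), jnp.bfloat16)
w = jnp.ones((o, i), jnp.bfloat16)          # stands in for an fp8 weight cast up
s = jnp.full((o,), 0.5, jnp.bfloat16)       # one scale per output channel

scale_after = (x @ w.T) * s                 # scale the matmul result, as apply() does
dequant_first = x @ (w * s[:, None]).T      # dequantize the weight, then matmul
assert jnp.allclose(scale_after, dequant_first)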
@@ -1,6 +1,7 @@
 from collections.abc import Callable

 import torch
+import torch.nn.functional as F
 from compressed_tensors.quantization import QuantizationStrategy

 from jax.sharding import PartitionSpec as P
@@ -58,7 +59,7 @@
 from jax.experimental.layout import Format, Layout
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torch.nn.parameter import Parameter
-from torchax.interop import jax_view, torch_view
+from torchax.interop import jax_view, torch_view, call_jax
 from torchax.ops.mappings import t2j
 from vllm.attention.layer import Attention
 from vllm.logger import init_logger
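The only import change here is pulling in call_jax alongside jax_view/torch_view; the rewritten apply() uses it to run jax.lax contractions directly on the layer's torch-view tensors. A minimal sketch of that calling pattern, assuming an active torchax environment (as in these vLLM TPU layers); the helper name and shapes are illustrative, not from the diff.

import jax
import jax.numpy as jnp
from torchax.interop import call_jax

def scaled_matmul(x, w, scale):
    # Contract x's last dim with w's last dim ("ti,oi->to"), accumulate in
    # bf16, then apply a per-output-channel scale -- the same call shape the
    # diff uses for each expert projection.
    y = call_jax(
        jax.lax.dot_general, x, w,
        dimension_numbers=(((1,), (1,)), ((), ())),
        preferred_element_type=jnp.bfloat16.dtype,
    )
    return y * scale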
@@ -139,33 +140,47 @@ def __init__(
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)

+        intermediate_size = layer.w13_weight.shape[1] // 2
+        w1_weight = layer.w13_weight[:, :intermediate_size]
+        w3_weight = layer.w13_weight[:, intermediate_size:]
+        w1_weight_scale = layer.w13_weight_scale[:, :intermediate_size]
+        w3_weight_scale = layer.w13_weight_scale[:, intermediate_size:]
+
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
-        w13_weight = t2j(layer.w13_weight, use_dlpack=False)
-        w13_weight_scale = t2j(layer.w13_weight_scale, use_dlpack=False)
         w2_weight_scale = t2j(layer.w2_weight_scale, use_dlpack=False)
+        w1_weight = t2j(w1_weight, use_dlpack=False)
+        w1_weight_scale = t2j(w1_weight_scale, use_dlpack=False)
+        w3_weight = t2j(w3_weight, use_dlpack=False)
+        w3_weight_scale = t2j(w3_weight_scale, use_dlpack=False)

         if layer.use_ep:
             format = Format(
                 Layout((0, 1, 2)), NamedSharding(self.mesh, P("model", None, None))
             )
-            w13_weight = jax.device_put(w13_weight, format)
-            w13_weight_scale = jax.device_put(w13_weight_scale, format)
+            w1_weight = jax.device_put(w1_weight, format)
+            w1_weight_scale = jax.device_put(w1_weight_scale, format)
+            w3_weight = jax.device_put(w3_weight, format)
+            w3_weight_scale = jax.device_put(w3_weight_scale, format)
             w2_weight = jax.device_put(w2_weight, format)
             w2_weight_scale = jax.device_put(w2_weight_scale, format)
         else:
-            intermediate_size = w13_weight.shape[1] // 2
             assert intermediate_size == w2_weight.shape[-1]
             output_sizes = [intermediate_size, intermediate_size]
             n_shards = self.mesh.shape["model"]
             assert intermediate_size % n_shards == 0
-            w13_weight = reorder_concatenated_tensor_for_sharding(
-                w13_weight, output_sizes, n_shards, dim=1
-            )
+
+            # TODO: enable this if using fused weights
+            # w13_weight = reorder_concatenated_tensor_for_sharding(
+            #     w13_weight, output_sizes, n_shards, dim=1
+            # )
+
             w13_format = Format(
                 Layout((0, 1, 2)), NamedSharding(self.mesh, P(None, "model", None))
             )
-            w13_weight = jax.device_put(w13_weight, w13_format)
-            w13_weight_scale = jax.device_put(w13_weight_scale, w13_format)
+            w1_weight = jax.device_put(w1_weight, w13_format)
+            w1_weight_scale = jax.device_put(w1_weight_scale, w13_format)
+            w3_weight = jax.device_put(w3_weight, w13_format)
+            w3_weight_scale = jax.device_put(w3_weight_scale, w13_format)
             w2_weight = jax.device_put(
                 w2_weight,
                 Format(
@@ -176,15 +191,21 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 w2_weight_scale,
                 Format(Layout((0, 1, 2)), NamedSharding(self.mesh, P())),
             )  # replicate
-        w13_weight = Parameter(torch_view(w13_weight), requires_grad=False)
+
+        w1_weight = Parameter(torch_view(w1_weight), requires_grad=False)
+        w1_weight_scale = Parameter(torch_view(w1_weight_scale), requires_grad=False)
         w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
-        w13_weight_scale = Parameter(torch_view(w13_weight_scale), requires_grad=False)
         w2_weight_scale = Parameter(torch_view(w2_weight_scale), requires_grad=False)
+        w3_weight = Parameter(torch_view(w3_weight), requires_grad=False)
+        w3_weight_scale = Parameter(torch_view(w3_weight_scale), requires_grad=False)

-        layer.w13_weight = w13_weight
+        # TODO: don't reuse the w13 variable names for the split w1 tensors
+        layer.w13_weight = w1_weight
+        layer.w13_weight_scale = w1_weight_scale
         layer.w2_weight = w2_weight
-        layer.w13_weight_scale = w13_weight_scale
         layer.w2_weight_scale = w2_weight_scale
+        layer.w3_weight = w3_weight
+        layer.w3_weight_scale = w3_weight_scale

     def apply(
         self,
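Checkpoints keep the gate (w1) and up (w3) projections fused along dim 1 of w13_weight, with matching per-row scales in w13_weight_scale. The hunks above split both tensors on the torch side before the t2j conversion and sharding, so apply() can scale each half independently. A sketch of the split with hypothetical sizes (160 experts comes from the diff's own comment; the other dimensions are made up):

# Hypothetical shapes only; not the diff's code.
import torch

num_experts, intermediate, hidden = 160, 1024, 512
w13_weight = torch.zeros(num_experts, 2 * intermediate, hidden)
w13_weight_scale = torch.ones(num_experts, 2 * intermediate, 1)

intermediate_size = w13_weight.shape[1] // 2
w1_weight = w13_weight[:, :intermediate_size]              # gate projection half
w3_weight = w13_weight[:, intermediate_size:]              # up projection half
w1_weight_scale = w13_weight_scale[:, :intermediate_size]  # one scale per output row
w3_weight_scale = w13_weight_scale[:, intermediate_size:]

assert w1_weight.shape == w3_weight.shape == (num_experts, intermediate, hidden)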
@@ -215,45 +236,97 @@ def apply(
         if scoring_func != "softmax":
             raise NotImplementedError("Only softmax is supported for scoring_func")

-        import sys
-
-        sys.stdin = open(0)
-        breakpoint()
-
-        _fused_moe_func = functools.partial(
-            jax.jit(
-                jax_fused_moe_func_padded,
-                static_argnames=[
-                    "topk",
-                    "global_num_experts",
-                    "renormalize",
-                    "reduce_results",
-                    "mesh",
-                    "use_ep",
-                ],
-            ),
-            topk=top_k,
-            global_num_experts=global_num_experts,
-            renormalize=renormalize,
-            reduce_results=layer.reduce_results,
-            mesh=self.mesh,
-            use_ep=layer.use_ep,
-        )
+        seqlen = x.shape[0]
+
+        # import sys
+
+        # sys.stdin = open(0)
+        # breakpoint()

-        output = _fused_moe_func(
-            jax_view(x),
-            (
-                jax_view(layer.w13_weight).astype(jnp.float32.dtype)
-                * jax_view(layer.w13_weight_scale)
-            ).astype(jnp.bfloat16.dtype),
-            (
-                jax_view(layer.w2_weight).astype(jnp.float32.dtype)
-                * jax_view(layer.w2_weight_scale)
-            ).astype(jnp.bfloat16.dtype),
-            jax_view(router_logits),
+        expert_weights = F.softmax(router_logits, dim=-1)
+        expert_weights, expert_indices = torch.topk(
+            expert_weights, top_k, dim=-1
         )
+        if renormalize:
+            expert_weights /= expert_weights.sum(dim=-1, keepdim=True)
+
+        # Conditional FFN, computed densely over all experts:
+        #   e = total number of experts = 160
+        #   t = seqlen
+        #   o = config.intermediate_size
+        #   i = config.dim
+        # x1 = torch.einsum("ti, eoi -> teo", x, layer.w13_weight) * layer.w13_weight_scale
+        ux1 = call_jax(
+            jax.lax.dot_general, x, layer.w13_weight,
+            dimension_numbers=(((1,), (2,)), ((), ())),
+            preferred_element_type=jnp.bfloat16.dtype,
+        )
+        x1 = F.silu(ux1 * layer.w13_weight_scale.squeeze(2))
+
+        # x3 = torch.einsum("ti, eoi -> teo", x, layer.w3_weight) * layer.w3_weight_scale
+        x3 = call_jax(
+            jax.lax.dot_general, x, layer.w3_weight,
+            dimension_numbers=(((1,), (2,)), ((), ())),
+            preferred_element_type=jnp.bfloat16.dtype,
+        ) * layer.w3_weight_scale.squeeze(2)
+
+        # expert_outs = torch.einsum("teo, eio -> tei", x1 * x3, layer.w2_weight) * layer.w2_weight_scale
+        expert_outs = call_jax(
+            jax.lax.dot_general, x1 * x3, layer.w2_weight,
+            dimension_numbers=(((2,), (2,)), ((1,), (0,))),
+            preferred_element_type=jnp.bfloat16.dtype,
+        ).transpose(0, 1) * layer.w2_weight_scale.squeeze(2)
+
+        # Keep only each token's top-k experts, then mix with the routing weights.
+        seq_indexes = torch.arange(seqlen, device="jax").unsqueeze(1)
+        expert_outs = expert_outs[seq_indexes, expert_indices]
+
+        # out = torch.einsum("tai, ta -> ti", expert_outs, expert_weights)
+        out = call_jax(
+            jax.lax.dot_general, expert_outs, expert_weights,
+            dimension_numbers=(((1,), (1,)), ((0,), (0,))),
+            preferred_element_type=jnp.bfloat16.dtype,
+        )

-        return torch_view(output)
+        return out
+
+
+
+        # _fused_moe_func = functools.partial(
+        #     jax.jit(
+        #         jax_fused_moe_func_padded,
+        #         static_argnames=[
+        #             "topk",
+        #             "global_num_experts",
+        #             "renormalize",
+        #             "reduce_results",
+        #             "mesh",
+        #             "use_ep",
+        #         ],
+        #     ),
+        #     topk=top_k,
+        #     global_num_experts=global_num_experts,
+        #     renormalize=renormalize,
+        #     reduce_results=layer.reduce_results,
+        #     mesh=self.mesh,
+        #     use_ep=layer.use_ep,
+        # )
+
+        # output = _fused_moe_func(
+        #     jax_view(x),
+        #     (
+        #         jax_view(layer.w13_weight).astype(jnp.bfloat16.dtype)
+        #         * jax_view(layer.w13_weight_scale).astype(jnp.bfloat16.dtype)
+        #     ).astype(jnp.bfloat16.dtype),
+        #     (
+        #         jax_view(layer.w2_weight).astype(jnp.bfloat16.dtype)
+        #         * jax_view(layer.w2_weight_scale).astype(jnp.bfloat16.dtype)
+        #     ).astype(jnp.bfloat16.dtype),
+        #     jax_view(router_logits),
+        # )
+
+        # return torch_view(output)

     def create_weights(
         self,
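Taken together, the new apply() is a dense reference MoE forward: softmax-plus-top-k routing, every expert's gated FFN evaluated for every token via batched contractions (the einsums named in the comments, expressed as jax.lax.dot_general calls through call_jax), a gather of each token's top-k expert outputs, and a weighted sum with the routing weights. The pure-JAX sketch below mirrors that math on toy shapes with already-dequantized stand-in weights; it is an illustration, not the diff's code.

# Pure-JAX illustration with toy shapes; weights stand in for dequantized experts.
import jax
import jax.numpy as jnp

t, i, e, o, k = 4, 8, 3, 16, 2            # tokens, hidden, experts, intermediate, top-k
key = jax.random.PRNGKey(0)
x = jax.random.normal(key, (t, i), jnp.bfloat16)
w1 = jax.random.normal(key, (e, o, i), jnp.bfloat16)   # gate projections
w3 = jax.random.normal(key, (e, o, i), jnp.bfloat16)   # up projections
w2 = jax.random.normal(key, (e, i, o), jnp.bfloat16)   # down projections
router_logits = jax.random.normal(key, (t, e))

# Routing: softmax over experts, keep each token's top-k weights, renormalize.
probs = jax.nn.softmax(router_logits, axis=-1)
weights, indices = jax.lax.top_k(probs, k)
weights = weights / weights.sum(axis=-1, keepdims=True)

# Dense expert FFN for every (token, expert) pair; these einsums are the ones
# referenced in the diff's comments and match its dot_general dimension_numbers.
x1 = jax.nn.silu(jnp.einsum("ti,eoi->teo", x, w1))
x3 = jnp.einsum("ti,eoi->teo", x, w3)
expert_outs = jnp.einsum("teo,eio->tei", x1 * x3, w2)

# Keep only each token's top-k experts, then mix with the routing weights.
expert_outs = expert_outs[jnp.arange(t)[:, None], indices]   # (t, k, i)
out = jnp.einsum("tki,tk->ti", expert_outs, weights)
assert out.shape == (t, i)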
