Commit dc66730

Initial commit

Signed-off-by: Jacob Platin <jacobplatin@google.com>
1 parent 280bbb2 commit dc66730

File tree: 4 files changed, +113 −25 lines

  tpu_commons/models/jax/common/moe/deepseek_moe.py
  tpu_commons/models/jax/deepseek_v3.py
  tpu_commons/models/jax/utils/quantization/quantization_utils.py
  tpu_commons/models/jax/utils/weight_utils.py

tpu_commons/models/jax/common/moe/deepseek_moe.py
25 additions, 10 deletions

@@ -1,7 +1,7 @@
 import enum
 from dataclasses import InitVar, dataclass
 from functools import partial
-from typing import Tuple
+from typing import Optional, Tuple

 import jax
 import jax.numpy as jnp
@@ -13,6 +13,8 @@
 from tpu_commons.models.jax.common.base import create_param
 from tpu_commons.models.jax.common.layers import FlaxUtils
 from tpu_commons.models.jax.common.moe.moe import MoE
+from tpu_commons.models.jax.utils.quantization.quantization_utils import (
+    manually_quantize_qwix_activation, manually_quantize_qwix_weight)

 modeling_flax_utils = FlaxUtils()

@@ -141,6 +143,8 @@ class SparseMoE(MoE):
     tile_size: tuple[int, int, int] = (128, 64, 128)
     use_megablox: bool = False
     mesh: jax.sharding.Mesh
+    # This should be set if and only if you have quantized your model (via Qwix)
+    quantized_dtype: Optional[jnp.dtype] = None

     def __post_init__(self, rngs: nnx.Rngs):
         super().__post_init__(rngs)
@@ -356,6 +360,10 @@ def _gmm(self, inputs, kernel, group_sizes):
             raise NotImplementedError(
                 "MegaBlox kernel call is not implemented.")
         else:
+            inputs = manually_quantize_qwix_activation(
+                inputs, "ragged_dot", jnp.float8_e4m3fn, [0], {},
+                "absmax") if self.quantized_dtype else inputs
+            # TODO: make qwix.ragged_dot
             output = jax.lax.ragged_dot(
                 lhs=inputs,
                 rhs=kernel,
@@ -583,12 +591,19 @@ def __call__(self, x_TD: Float):
             check_rep=False)(
                 SparseMoE._distributed_sparse_moe_fwd)

-        return mapped_moe_fwd(
-            self,
-            x_TD,
-            router_weights_TX,
-            selected_experts_TX,
-            self.kernel_gating_EDF.value,
-            self.kernel_up_proj_EDF.value,
-            self.kernel_down_proj_EFD.value,
-        )
+        quantized_kernel_gating_EDF = manually_quantize_qwix_weight(
+            self.kernel_gating_EDF.value, self.quantized_dtype, [0, 2], {},
+            "absmax") if self.quantized_dtype else self.kernel_gating_EDF.value
+        quantized_kernel_up_proj_EDF = manually_quantize_qwix_weight(
+            self.kernel_up_proj_EDF.value, self.quantized_dtype, [0, 2], {},
+            "absmax"
+        ) if self.quantized_dtype else self.kernel_up_proj_EDF.value
+        quantized_kernel_down_proj_EFD = manually_quantize_qwix_weight(
+            self.kernel_down_proj_EFD.value, self.quantized_dtype, [0, 2], {},
+            "absmax"
+        ) if self.quantized_dtype else self.kernel_down_proj_EFD.value
+
+        return mapped_moe_fwd(self, x_TD, router_weights_TX,
+                              selected_experts_TX, quantized_kernel_gating_EDF,
+                              quantized_kernel_up_proj_EDF,
+                              quantized_kernel_down_proj_EFD)
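
For context on the quantization axes used in the `_gmm` and `__call__` hunks above: `jax.lax.ragged_dot` contracts a `[m, k]` activation against a `[g, k, n]` stack of per-expert kernels, so the non-contraction axes are `[0]` for the activation and `[0, 2]` for the kernel, which is what the `channelwise_axes` arguments encode. A minimal shape sketch (the dimension sizes are illustrative, not taken from this commit):

    import jax
    import jax.numpy as jnp

    m, k, n, g = 8, 16, 32, 4                         # tokens, in-dim, out-dim, experts
    lhs = jnp.ones((m, k), jnp.bfloat16)              # activations; channelwise axis [0]
    rhs = jnp.ones((g, k, n), jnp.bfloat16)           # expert kernels; channelwise axes [0, 2]
    group_sizes = jnp.array([2, 2, 2, 2], jnp.int32)  # rows of lhs per expert, sums to m

    # Each contiguous group of lhs rows is multiplied by its expert's [k, n] slice;
    # k is contracted, so per-channel scales live on the remaining axes.
    out = jax.lax.ragged_dot(lhs, rhs, group_sizes)
    print(out.shape)  # (8, 32)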

tpu_commons/models/jax/deepseek_v3.py
11 additions, 15 deletions

@@ -24,6 +24,8 @@
 from tpu_commons.models.jax.common.moe.moe import MoE
 from tpu_commons.models.jax.common.transformer_block import (
     SharedExpertsTransformerBlock, TransformerBlock)
+from tpu_commons.models.jax.utils.quantization.quantization_utils import \
+    get_quant_dtype_from_qwix_config
 from tpu_commons.models.jax.utils.weight_utils import (get_param,
                                                        model_weights_generator,
                                                        print_param_info,
@@ -45,7 +47,7 @@ def __init__(self,
         self.vllm_config = vllm_config
         self.rng = nnx.Rngs(rng)

-        num_layers: int = 61
+        num_layers: int = 4
         num_local_experts: int = 256

         vocab_size: int = 129280
@@ -211,6 +213,8 @@ def _create_mla() -> MLA:
             activation_ffw_ted=('data', None, None),
             edf_sharding=('model', None, None),
             efd_sharding=('model', None, None),
+            quantized_dtype=self.weight_loader.quant_dtype
+            if self.weight_loader.is_model_quantized else None,
             router=router) if is_moe_layer else DenseFFW(
                 dtype=dtype,
                 hidden_act=hidden_act,
@@ -452,20 +456,12 @@ def __init__(self, vllm_config: VllmConfig, num_layers, hidden_size,
         quantization_type = vllm_config.model_config.hf_config.quantization_config[
             "quant_method"]
         assert quantization_type == "fp8", "DeepSeek only supports the fp8 quantization method for now"
-        # NOTE: this will only be used for loading in quantized weights (via Qwix)
-        qwix_config = vllm_config.additional_config.get(
-            "quantization", {}).get("qwix", {})
-        self.scale_dtype = getattr(
-            jnp, qwix_config.get("scale_dtype", "bfloat16"))
-        # TODO (jacobplatin): move this out of DeepSeek class to a utility function
-        for rule in qwix_config.get("rules", []):
-            if rule.get("module_path") == ".*":
-                quant_dtype_str = rule.get("weight_qtype", "")
-                assert quant_dtype_str, "Quantization dtype not found in Qwix config! We currently expect your Qwix config to have a rule with module_path '.*' and a weight_qtype."
-                self.quant_dtype = getattr(jnp, quant_dtype_str)
-        logger.info(
-            f"Quantizing DeepSeek with quantization dtype: {self.quant_dtype} and scale dtype: {self.scale_dtype}"
-        )
+        self.scale_dtype, self.quant_dtype = get_quant_dtype_from_qwix_config(
+            vllm_config)
+
+        logger.info(
+            f"Quantizing DeepSeek with quantization dtype: {self.quant_dtype} and scale dtype: {self.scale_dtype}"
+        )

         quantization_block_sizes = vllm_config.model_config.hf_config.quantization_config[
             "weight_block_size"]

tpu_commons/models/jax/utils/quantization/quantization_utils.py
76 additions, 0 deletions

@@ -7,11 +7,14 @@
 import jax
 import jax.numpy as jnp
 import qwix
+import qwix.pallas
 import yaml
 from flax import nnx
 from flax.typing import PRNGKey
 from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
+from qwix._src.core.qarray import QArray
+from qwix._src.providers import ptq

 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -507,3 +510,76 @@ def load_random_weights_into_qwix_abstract_model(rng: PRNGKey,
     if hasattr(model, 'initialize_cache'):
         model.initialize_cache()
     logger.info("Done initializing Qwix-quantized model with random weights")
+
+
+def manually_quantize_qwix_weight(weight: jax.Array, qtype: jnp.dtype,
+                                  channelwise_axes: List[int],
+                                  tiled_axes: dict,
+                                  calibration_method: str) -> QArray:
+    """Manually quantizes a weight tensor using Qwix; the weight-side
+    counterpart of `manually_quantize_qwix_activation` below."""
+    # TODO (jacobplatin): clean this up; this is needed because of issues with Qwix quantizing the `shard_map` in SparseMatmul
+    how_to_quantize = ptq.qarray.HowToQuantize(
+        qtype=qtype,
+        channelwise_axes=channelwise_axes,
+        tiled_axes=tiled_axes,
+        calibration_method=calibration_method)
+
+    return ptq._create_quantized_param(weight, how_to_quantize)
+
+
+def manually_quantize_qwix_activation(inputs: jax.Array, rule_name: str,
+                                      qtype: jnp.dtype,
+                                      channelwise_axes: List[int],
+                                      tiled_axes: dict,
+                                      calibration_method: str) -> QArray:
+    """
+    Manually quantizes an activation tensor using Qwix. Needed for the SparseMatmul
+    DeepSeek MoE case currently.
+
+    Args:
+        inputs: The activation tensor to quantize.
+        rule_name: The name of the quantization rule to use.
+        qtype: The quantization type.
+        channelwise_axes: The channelwise axes to quantize.
+        tiled_axes: The tiled axes to quantize.
+        calibration_method: The calibration method to use.
+
+    Returns:
+        The quantized activation tensor.
+    """
+    rule = qwix.pallas.get_current_rule(rule_name)
+    lhs_how = ptq.qarray.HowToQuantize(qtype=qtype,
+                                       channelwise_axes=channelwise_axes,
+                                       tiled_axes=tiled_axes,
+                                       calibration_method=calibration_method)
+    assert not rule.act_static_scale, "Static scale not supported right now"
+
+    # channelwise_axes should be set to (a subset of) non-contraction axes, e.g.
+    # for ragged_dot [m, k] x [g, k, n], they are [0] and [0, 2]
+    # TODO (jacobplatin): add support for `act_name`
+    return ptq._quantize_act(inputs, lhs_how, rule, "")
+
+
+def get_quant_dtype_from_qwix_config(
+        vllm_config: "VllmConfig") -> tuple[jnp.dtype, jnp.dtype]:
+    """
+    Gets the quantization dtype from the Qwix config.
+
+    Args:
+        vllm_config: The VllmConfig object.
+
+    Returns:
+        A tuple of the scale dtype and quant dtype.
+    """
+    qwix_config = vllm_config.additional_config.get("quantization",
+                                                    {}).get("qwix", {})
+    scale_dtype = getattr(jnp, qwix_config.get("scale_dtype", "bfloat16"))
+    quant_dtype = None
+    # TODO (jacobplatin): this needs to be much more robust
+    for rule in qwix_config.get("rules", []):
+        if rule.get("module_path") == ".*":
+            quant_dtype_str = rule.get("weight_qtype", "")
+            assert quant_dtype_str, "Quantization dtype not found in Qwix config! We currently expect your Qwix config to have a rule with module_path '.*' and a weight_qtype."
+            quant_dtype = getattr(jnp, quant_dtype_str)
+    return scale_dtype, quant_dtype
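
To tie the new helpers back to the MoE change, the weight path in `SparseMoE.__call__` reduces to a call like the following. The array shape is made up for illustration; the dtype, axes, and calibration method mirror the call site in `deepseek_moe.py`:

    import jax.numpy as jnp

    from tpu_commons.models.jax.utils.quantization.quantization_utils import (
        manually_quantize_qwix_weight)

    # [E, D, F] expert kernel; D is the ragged_dot contraction axis, so the
    # channelwise (per-scale) axes are the expert axis 0 and output axis 2.
    kernel_up_proj_EDF = jnp.ones((8, 64, 128), dtype=jnp.bfloat16)

    quantized_up_proj = manually_quantize_qwix_weight(
        kernel_up_proj_EDF,
        jnp.float8_e4m3fn,  # quantized_dtype configured on SparseMoE
        [0, 2],             # channelwise_axes
        {},                 # tiled_axes
        "absmax")           # calibration_method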

tpu_commons/models/jax/utils/weight_utils.py
1 addition, 0 deletions

@@ -111,6 +111,7 @@ def get_model_weights_files(
     )

     weights_files.sort()
+    weights_files = weights_files[:4] + [weights_files[-4]]
     return weights_files
