Commit 8e4a56f

RMS norm works fully now; had to remove more conversions (and add them back in the replacements). TODO: add a pass to remove unnecessary conversions?
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
1 parent cdad3c0 commit 8e4a56f

File tree

4 files changed: +61 −32 lines changed


csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu

Lines changed: 4 additions & 0 deletions
@@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
   if (scale_ub.has_value()) {
     TORCH_CHECK(out.dtype() == kFp8Type);
   }
+  TORCH_CHECK(weight.dtype() == input.dtype());
   TORCH_CHECK(scales.dtype() == torch::kFloat32);
+  if (residual) {
+    TORCH_CHECK(residual->scalar_type() == input.scalar_type());
+  }
 
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {
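
The new checks tighten the dtype contract of the fused kernel: the weight must match the input dtype, and so must the residual when one is passed. A rough Python-side illustration of that contract (a hypothetical helper, not the actual op binding; kFp8Type is assumed to map to torch.float8_e4m3fn as on CUDA builds):

import torch
from typing import Optional

def check_rms_quant_dtypes(out: torch.Tensor, input: torch.Tensor,
                           weight: torch.Tensor, scales: torch.Tensor,
                           residual: Optional[torch.Tensor] = None,
                           scale_ub: Optional[torch.Tensor] = None) -> None:
    # Mirrors the TORCH_CHECKs in rms_norm_dynamic_per_token_quant.
    if scale_ub is not None:
        assert out.dtype == torch.float8_e4m3fn   # kFp8Type (CUDA)
    assert weight.dtype == input.dtype            # added in this commit
    assert scales.dtype == torch.float32
    if residual is not None:
        assert residual.dtype == input.dtype      # added in this commit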

tests/compile/test_fusion.py

Lines changed: 12 additions & 9 deletions
@@ -9,8 +9,8 @@
                                      FusedRMSQuantKey, RMSNormQuantFusionPass)
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
-                         VllmConfig)
+from vllm.config import (CompilationConfig, CompilationLevel, ModelConfig,
+                         PassConfig, VllmConfig)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape, QuantKey, ScaleDesc)
@@ -119,13 +119,16 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
         custom_ops.append("+rms_norm")
     if enable_quant_fp8:
         custom_ops.append("+quant_fp8")
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        debug_dump_path=f"/home/luka/git/vllm/._workspace/"
-        f"debug_dump_{enable_rms_norm}_{enable_quant_fp8}",
-        level=CompilationLevel.PIECEWISE,
-        custom_ops=custom_ops,
-        pass_config=PassConfig(enable_fusion=True, enable_noop=True),
-    ))
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(dtype=dtype),
+        compilation_config=CompilationConfig(
+            debug_dump_path=f"/home/luka/git/vllm/._workspace/"
+            f"debug_dump_{enable_rms_norm}_{enable_quant_fp8}",
+            level=CompilationLevel.PIECEWISE,
+            custom_ops=custom_ops,
+            pass_config=PassConfig(enable_fusion=True, enable_noop=True),
+        ),
+    )
     with vllm.config.set_current_vllm_config(vllm_config):
         # Reshape pass is needed for the fusion pass to work
         noop_pass = NoOpEliminationPass(vllm_config)
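
The test now builds the VllmConfig with a ModelConfig so that passes and matchers constructed under set_current_vllm_config can resolve the model dtype from the global config (see matcher_utils.py below). A minimal sketch of that interaction, using bfloat16 as an assumed dtype:

import torch
from vllm.config import (ModelConfig, VllmConfig, get_current_vllm_config,
                         set_current_vllm_config)

vllm_config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
with set_current_vllm_config(vllm_config):
    # Anything constructed inside the context (e.g. MatcherRMSNorm) can read:
    model_dtype = get_current_vllm_config().model_config.dtype
    assert model_dtype == torch.bfloat16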

vllm/compilation/fusion.py

Lines changed: 37 additions & 16 deletions
@@ -9,7 +9,7 @@
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch._ops import OpOverload
 
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape, QuantKey, ScaleDesc, kFp8DynamicTensorSym, kFp8DynamicTokenSym,
@@ -117,6 +117,10 @@ def pattern(input: torch.Tensor, weight: torch.Tensor,
 
         def replacement(input: torch.Tensor, weight: torch.Tensor,
                         scale: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=torch.float16)  # TODO model dtype
+
             result = torch.empty_like(input, dtype=self.quant_dtype)
             at = auto_functionalized(self.FUSED_OP,
                                      result=result,
@@ -130,7 +134,7 @@ def replacement(input: torch.Tensor, weight: torch.Tensor,
 
         inputs = [
            empty_bf16(5, 4),  # input
-            empty_bf16(4,),  # weight
+            empty_bf16(4, ),  # weight
            empty_fp32(1, 1)  # scale
        ]
        pattern(*inputs)
@@ -163,6 +167,11 @@ def pattern(input: torch.Tensor, residual: torch.Tensor,
 
         def replacement(input: torch.Tensor, residual: torch.Tensor,
                         weight: torch.Tensor, scale: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=torch.float16)  # TODO model dtype
+            residual = residual.to(dtype=torch.float16)
+
             result = torch.empty_like(input, dtype=self.quant_dtype)
             at = auto_functionalized(self.FUSED_OP,
                                      result=result,
@@ -176,9 +185,11 @@ def replacement(input: torch.Tensor, residual: torch.Tensor,
             return at[1], at[2]
 
         inputs = [
+            # TODO: maybe 32bit for torch impl?
+            # TODO dtype doesn't seem to matter?
             empty_bf16(5, 4),  # input
             empty_bf16(5, 4),  # residual
-            empty_bf16(4,),  # weight
+            empty_bf16(4, ),  # weight
             empty_fp32(1, 1)  # scale
         ]
 
@@ -213,6 +224,10 @@ def pattern(input: torch.Tensor, weight: torch.Tensor):
             return self.quant_matcher(result_rms)
 
         def replacement(input: torch.Tensor, weight: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=torch.float16)  # TODO model dtype
+
             result = torch.empty_like(input, dtype=self.quant_dtype)
             scale = self.quant_matcher.make_scale(input)
             at = auto_functionalized(self.FUSED_OP,
@@ -267,6 +282,11 @@ def pattern(input: torch.Tensor, residual: torch.Tensor,
 
         def replacement(input: torch.Tensor, residual: torch.Tensor,
                         weight: torch.Tensor):
+            # In case we're matching native rms-norm, conversions might be
+            # optimized out. We convert here just to be safe.
+            input = input.to(dtype=torch.float16)  # TODO model dtype
+            residual = residual.to(dtype=torch.float16)
+
             result = torch.empty_like(input, dtype=self.quant_dtype)
             scale = self.quant_matcher.make_scale(input)
             at = auto_functionalized(self.FUSED_OP,
@@ -309,22 +329,23 @@ def __init__(self, config: VllmConfig):
         self.patterns: PatternMatcherPass = PatternMatcherPass(
             pass_name="rmsnorm_quant_fusion_pass")
 
-        for epsilon in [1e-5, 1e-6]:
-            # Fuse rms_norm + static fp8 quant
-            RMSNormStaticQuantPattern(epsilon,
-                                      FP8_DTYPE).register(self.patterns)
+        with set_current_vllm_config(config, check_compile=False):
+            for epsilon in [1e-5, 1e-6]:
+                # Fuse rms_norm + static fp8 quant
+                RMSNormStaticQuantPattern(epsilon,
+                                          FP8_DTYPE).register(self.patterns)
 
-            # Fuse fused_add_rms_norm + static fp8 quant
-            FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
-                self.patterns)
+                # Fuse fused_add_rms_norm + static fp8 quant
+                FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register(
+                    self.patterns)
 
-            # Fuse rms_norm + dynamic per-token fp8 quant
-            RMSNormDynamicQuantPattern(epsilon,
-                                       FP8_DTYPE).register(self.patterns)
+                # Fuse rms_norm + dynamic per-token fp8 quant
+                RMSNormDynamicQuantPattern(epsilon,
+                                           FP8_DTYPE).register(self.patterns)
 
-            # Fuse fused_add_rms_norm + dynamic per-token fp8 quant
-            FusedAddRMSNormDynamicQuantPattern(epsilon, FP8_DTYPE).register(
-                self.patterns)
+                # Fuse fused_add_rms_norm + dynamic per-token fp8 quant
+                FusedAddRMSNormDynamicQuantPattern(
+                    epsilon, FP8_DTYPE).register(self.patterns)
 
         self.dump_patterns(config, self.patterns)
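
Each replacement now casts its inputs explicitly (hard-coded to torch.float16 for now, see the TODO about using the model dtype), because when the native rms-norm is matched, the conversions that used to appear in the pattern may already have been optimized out of the graph. A small illustration of why the extra cast is harmless in eager mode (my own example, not part of the commit): Tensor.to returns the tensor unchanged when it already has the target dtype, so the cast only does work when the dtypes actually differ. In a traced graph it still leaves a convert node behind, which is what the commit message's TODO about a conversion-removal pass refers to.

import torch

x = torch.randn(5, 4, dtype=torch.float16)
assert x.to(dtype=torch.float16) is x                     # already fp16: no copy, same tensor
y = torch.randn(5, 4, dtype=torch.float32)
assert y.to(dtype=torch.float16).dtype == torch.float16   # real conversion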

vllm/compilation/matcher_utils.py

Lines changed: 8 additions & 7 deletions
@@ -6,6 +6,7 @@
 from torch._higher_order_ops import auto_functionalized
 from torch._ops import OpOverload
 
+from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey, _normalize_quant_group_shape, kFp8DynamicTensorSym,
@@ -29,16 +30,18 @@
 # kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
 
 
-class MatcherRMSNorm: # TODO separate residual and not residual
+class MatcherRMSNorm:  # TODO separate residual and not residual
 
     def __init__(self, epsilon: float, enabled: Optional[bool] = None):
         self.epsilon = epsilon
 
         if enabled is None:
-            # TODO either pass config to enabled or set it globally (global during pass init seems reasonable)
+            # TODO either pass config to enabled or set it globally
+            # (global during pass init seems reasonable)
             enabled = RMSNorm.enabled()
 
         self.forward = self.forward_custom if enabled else self.forward_native
+        self.model_dtype = get_current_vllm_config().model_config.dtype
 
     def forward_custom(
         self,
@@ -72,22 +75,20 @@ def forward_native(
         weight: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        orig_dtype = input.dtype
-        x = input.to(torch.float32)
+        x = input  # .to(torch.float32)
         if residual is not None:
             x = x + residual.to(torch.float32)
-            residual = x
+            residual = x  # conversion to 16-bit is eliminated in full graph
 
         variance = x.pow(2).mean(dim=-1, keepdim=True)
 
         x = x * torch.rsqrt(variance + self.epsilon)
-        x = x.to(orig_dtype)
+        x = x.to(self.model_dtype)
         if weight is not None:
             x = x * weight
 
         return x if residual is None else (x, residual)
 
-
     def __call__(
         self,
         input: torch.Tensor,
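
With the up-front float32 cast removed and the output cast now targeting the model dtype, the arithmetic that forward_native traces looks roughly like the standalone sketch below (a rewrite for illustration only; epsilon and model_dtype are taken as plain parameters rather than instance attributes):

import torch
from typing import Optional, Union

def rms_norm_native(
    input: torch.Tensor,
    weight: Optional[torch.Tensor],
    residual: Optional[torch.Tensor] = None,
    epsilon: float = 1e-6,
    model_dtype: torch.dtype = torch.bfloat16,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
    # No up-front cast of `input`; only the residual add is forced to float32.
    x = input
    if residual is not None:
        x = x + residual.to(torch.float32)
        residual = x  # stays in float32; the 16-bit conversion is elided in the full graph
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + epsilon)
    x = x.to(model_dtype)  # cast to the model dtype rather than the input dtype
    if weight is not None:
        x = x * weight
    return x if residual is None else (x, residual)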

0 commit comments
