@@ -8,6 +8,7 @@
 
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey, _normalize_quant_group_shape, kFp8DynamicTensorSym,
     kFp8DynamicTokenSym, kFp8StaticTensorSym)
@@ -100,17 +101,29 @@ def __call__(
 
 class MatcherQuant:
 
-    def __init__(self, quant_key: QuantKey):
+    def __init__(self, quant_key: QuantKey, enabled: Optional[bool] = None):
+
         self.quant_key = quant_key
         assert quant_key in QUANT_OPS, \
             f"unsupported quantization scheme {quant_key}"
         self.QUANT_OP = QUANT_OPS[quant_key]
 
-    def forward(
+        assert quant_key.scale2 is None
+        self.quant_fp8 = QuantFP8(quant_key.scale.static,
+                                  quant_key.scale.group_shape)
+
+        if enabled is None:
+            # TODO either pass config to enabled or set it globally
+            # (global during pass init seems reasonable)
+            enabled = self.quant_fp8.enabled()
+
+        self.forward = self.forward_custom if enabled else self.forward_native
+
+    def forward_custom(
         self,
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # TODO: why does empty_like produce a permute but
         # empty via shape doesn't?
         result = torch.empty(input.shape,
@@ -123,7 +136,7 @@ def forward(
                 result=result,
                 input=input,
                 scale=scale)
-            return result
+            return result, scale
         else:
             assert scale is None
             scale = self.make_scale(input)
@@ -134,6 +147,13 @@ def forward(
                 scale_ub=None)
             return result, scale
 
+    def forward_native(
+        self,
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.quant_fp8(input, scale)
+
     def make_scale(self, input: torch.Tensor):
         normalized_group_shape = _normalize_quant_group_shape(
             input, self.quant_key.scale.group_shape)
@@ -146,9 +166,8 @@ def make_scale(self, input: torch.Tensor):
                            device=input.device,
                            dtype=torch.float32)
 
-    def __call__(
-        self,
-        input: torch.Tensor,
-        scale: Optional[torch.Tensor] = None
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    def __call__(self,
+                 input: torch.Tensor,
+                 scale: Optional[torch.Tensor] = None
+                 ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.forward(input, scale)
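
A quick usage sketch of the new dispatch (not part of the commit): `MatcherQuant` now always returns a `(quantized, scale)` tuple, and `enabled` selects between the custom-op path (`forward_custom`) and the torch-native `QuantFP8` fallback (`forward_native`). The import path for `MatcherQuant`, the example shapes, and the CUDA/FP8 device assumption are all illustrative.

```python
# Hedged sketch: exercises the enabled-based dispatch added in __init__.
# Assumes MatcherQuant is in scope (its module is not shown in this diff).
import torch

from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kFp8DynamicTokenSym)

# enabled=None: resolved via QuantFP8.enabled() for the current config.
matcher = MatcherQuant(kFp8DynamicTokenSym)

# enabled=False: force the torch-native QuantFP8 fallback.
native = MatcherQuant(kFp8DynamicTokenSym, enabled=False)

x = torch.randn(16, 128, dtype=torch.bfloat16, device="cuda")

# Both paths now share the same contract: a (quantized, scale) tuple.
# With a dynamic per-token key, scale is computed rather than passed in;
# per make_scale's math it should have one entry per token, e.g. (16, 1).
q, s = matcher(x)
q2, s2 = native(x)
```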