Commit ada71fd

[Core] Default to using per_token quantization for fp8 when cutlass is supported. (vllm-project#8651)

Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
3 people authored and Ubuntu committed Jan 19, 2025
1 parent c2cd42b commit ada71fd
Showing 1 changed file with 2 additions and 1 deletion.
vllm/model_executor/layers/quantization/fp8.py (3 changes: 2 additions & 1 deletion)
@@ -355,7 +355,8 @@ def apply(self,
            input_scale=layer.input_scale,
            bias=bias,
            cutlass_fp8_supported=self.cutlass_fp8_supported,
-           use_per_token_if_dynamic=False)
+           # Default to using per_token quantization if cutlass is supported
+           use_per_token_if_dynamic=self.cutlass_fp8_supported)


class Fp8MoEMethod(FusedMoEMethodBase):

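For context, a minimal sketch of what the use_per_token_if_dynamic flag conceptually toggles for dynamic fp8 activation quantization: per-tensor quantization keeps a single scale for the whole activation tensor, while per-token quantization computes one scale per row (token), so an outlier token does not inflate the quantization error of every other token. This is plain PyTorch for illustration only, not the vLLM kernel path; FP8_MAX and the helper names are assumptions made for the sketch.

# Illustrative sketch only; not vLLM's apply_fp8_linear implementation.
# Requires a PyTorch build with float8_e4m3fn support (>= 2.1).
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def dynamic_per_tensor_fp8(x: torch.Tensor):
    # One scale for the whole activation tensor.
    scale = x.abs().max().clamp(min=1e-12) / FP8_MAX
    x_q = (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_q, scale  # scale: scalar


def dynamic_per_token_fp8(x: torch.Tensor):
    # One scale per row (token); outlier tokens only affect their own scale.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / FP8_MAX
    x_q = (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_q, scale  # scale: [num_tokens, 1]


x = torch.randn(4, 8)        # [num_tokens, hidden_size]
q_t, s_t = dynamic_per_tensor_fp8(x)
q_p, s_p = dynamic_per_token_fp8(x)
print(s_t.shape, s_p.shape)  # torch.Size([]) vs torch.Size([4, 1])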