vllm-project
diff --git a/‎vllm/model_executor/layers/linear.py‎
Lines changed: 7 additions & 7 deletions b/‎vllm/model_executor/layers/linear.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py‎
Lines changed: 35 additions & 33 deletions b/‎vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py‎
Lines changed: 35 additions & 33 deletions
@@ -805,12 +805,10 @@ def weight_loader_v2(self,
         assert loaded_shard_id < len(self.output_sizes)
 
         if isinstance(param, BlockQuantScaleParameter):
-            from vllm.model_executor.layers.quantization.fp8 import (
-                Fp8LinearMethod, Fp8MoEMethod)
             assert self.quant_method is not None
-            assert isinstance(self.quant_method,
-                              (Fp8LinearMethod, Fp8MoEMethod))
-            weight_block_size = self.quant_method.quant_config.weight_block_size
+            # Assume the weight block size has been set by quant method
+            assert hasattr(self, "weight_block_size")
+            weight_block_size = self.weight_block_size
             assert weight_block_size is not None
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (
@@ -989,8 +987,10 @@ def weight_loader_v2(self,
         # Note(simon): This is needed for Qwen3's fp8 quantization.
         if isinstance(param, BlockQuantScaleParameter):
             assert self.quant_method is not None
-            assert hasattr(self.quant_method, "quant_config")
-            weight_block_size = self.quant_method.quant_config.weight_block_size
+            # Assume the weight block size has been set by quant method
+            assert hasattr(self, "weight_block_size")
+            weight_block_size = self.weight_block_size
+            assert weight_block_size is not None
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (shard_offset + block_n - 1) // block_n
             shard_size = (shard_size + block_n - 1) // block_n
 
@@ -12,7 +12,6 @@
                                              QuantizationStrategy,
                                              QuantizationType)
 from compressed_tensors.transform import TransformConfig
-from pydantic import BaseModel
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -268,7 +267,8 @@ def _check_scheme_supported(self,
         else:
             return False
 
-    def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
+    def _is_fp4a4_nvfp4(self, weight_quant: QuantizationArgs,
+                        input_quant: QuantizationArgs):
 
         if weight_quant is None or input_quant is None:
             return False
@@ -288,8 +288,8 @@ def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
         return (is_tensor_group_quant and is_float_type and is_4_bits
                 and is_group_size_16 and is_symmetric)
 
-    def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
-                         input_quant: BaseModel):
+    def _is_fp4a16_nvfp4(self, weight_quant: QuantizationArgs,
+                         input_quant: QuantizationArgs):
 
         is_weight_only = weight_quant is not None and input_quant is None
         is_tensor_group_quant = (
@@ -303,8 +303,8 @@ def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
         return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)
 
-    def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
-                               input_quant: BaseModel) -> bool:
+    def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
@@ -317,8 +317,8 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
         # Only symmetric weight quantization supported.
         return is_8_bits and is_tensor and weight_quant.symmetric and is_static
 
-    def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
-                               input_quant: BaseModel) -> bool:
+    def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
@@ -331,8 +331,8 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
 
-    def _is_dynamic_token_w4a8_int(self, weight_quant: BaseModel,
-                                   input_quant: BaseModel) -> bool:
+    def _is_dynamic_token_w4a8_int(self, weight_quant: QuantizationArgs,
+                                   input_quant: QuantizationArgs) -> bool:
         is_weight_4_bits = weight_quant.num_bits == 4
         is_activation_8_bits = input_quant.num_bits == 8
         weight_strategy = (
@@ -347,8 +347,8 @@ def _is_dynamic_token_w4a8_int(self, weight_quant: BaseModel,
         return (is_weight_4_bits and is_activation_8_bits and is_token
                 and weight_quant.symmetric and is_dynamic)
 
-    def _is_fp8_w8a8(self, weight_quant: BaseModel,
-                     input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8(self, weight_quant: QuantizationArgs,
+                     input_quant: QuantizationArgs) -> bool:
         # Confirm weights and activations quantized.
         if weight_quant is None or input_quant is None:
             return False
@@ -358,11 +358,12 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel,
                              and input_quant.type == QuantizationType.FLOAT)
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
+        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
+            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK
         ])
         if not (is_floating_point and is_symmetric_weight and is_static_weight
-                and is_per_tensor_or_channel_weight):
+                and is_tensor_or_channel_or_block_weight):
             return False
 
         # Dynamic quantization is always supported if weights supported.
@@ -375,8 +376,8 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel,
             input_quant.strategy == QuantizationStrategy.TENSOR)
         return is_symmetric_activation and is_per_tensor_activation
 
-    def _is_fp8_w4a8(self, weight_quant: BaseModel,
-                     input_quant: BaseModel) -> bool:
+    def _is_fp8_w4a8(self, weight_quant: QuantizationArgs,
+                     input_quant: QuantizationArgs) -> bool:
         if not weight_quant or not input_quant:
             return False
         is_weight_4_bits = weight_quant.num_bits == 4
@@ -392,24 +393,24 @@ def _is_fp8_w4a8(self, weight_quant: BaseModel,
         return (is_weight_4_bits and is_activation_8_bits and is_token
                 and is_symmetric and is_dynamic)
 
-    def _is_fp8_w4a8_sm90(self, weight_quant: BaseModel,
-                          input_quant: BaseModel) -> bool:
+    def _is_fp8_w4a8_sm90(self, weight_quant: QuantizationArgs,
+                          input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(90, error=False, match_exact=True)
                 and self._is_fp8_w4a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel,
-                          input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8_sm90(self, weight_quant: QuantizationArgs,
+                          input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(90, error=False, match_exact=True)
                 and self._is_fp8_w8a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a8_sm100(self, weight_quant: BaseModel,
-                           input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8_sm100(self, weight_quant: QuantizationArgs,
+                           input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(
             100, error=False, match_exact=True)
                 and self._is_fp8_w8a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a16(self, weight_quant: BaseModel,
-                      input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a16(self, weight_quant: QuantizationArgs,
+                      input_quant: QuantizationArgs) -> bool:
         # Confirm weights quantized.
         if weight_quant is None:
             return False
@@ -421,18 +422,19 @@ def _is_fp8_w8a16(self, weight_quant: BaseModel,
         # Confirm weight scheme is supported.
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
+        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
+            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK
         ])
         if not (is_symmetric_weight and is_static_weight  # noqa: SIM103
-                and is_per_tensor_or_channel_weight):
+                and is_tensor_or_channel_or_block_weight):
             return False
 
         # All conditions satisfied.
         return True
 
-    def _is_wNa16_group_channel(self, weight_quant: BaseModel,
-                                input_quant: BaseModel) -> bool:
+    def _is_wNa16_group_channel(self, weight_quant: QuantizationArgs,
+                                input_quant: QuantizationArgs) -> bool:
         input_quant_none = input_quant is None
         is_channel_group = (
             weight_quant.strategy == QuantizationStrategy.CHANNEL.value
@@ -443,8 +445,8 @@ def _is_wNa16_group_channel(self, weight_quant: BaseModel,
 
     def _get_scheme_from_parts(
             self,
-            weight_quant: BaseModel,
-            input_quant: BaseModel,
+            weight_quant: QuantizationArgs,
+            input_quant: QuantizationArgs,
             format: Optional[str] = None) -> "CompressedTensorsScheme":
 
         # use the per-layer format if defined, otherwise, use global format
@@ -496,7 +498,7 @@ def _get_scheme_from_parts(
                     CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
                 if is_fp8_w8a8_supported:
                     return CompressedTensorsW8A8Fp8(
-                        strategy=weight_quant.strategy,
+                        weight_quant=weight_quant,
                         is_static_input_scheme=(input_quant
                                                 and not input_quant.dynamic))
                 else: