 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase
-from vllm.model_executor.layers.quantization import register_quantization_config
-from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase  # noqa: E501
+from vllm.model_executor.layers.quantization import \
+    register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizeMethodBase  # noqa: E501
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
-    CompressedTensorsConfig,
-    CompressedTensorsKVCacheMethod,
-    CompressedTensorsLinearMethod,
-    CompressedTensorsScheme,
-)
-from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import (
-    CompressedTensorsW8A8Fp8MoEMethod,
-)
+    CompressedTensorsConfig, CompressedTensorsKVCacheMethod,
+    CompressedTensorsLinearMethod, CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target,
-    is_activation_quantization_format,
-    should_ignore_layer,
-)
+    find_matched_target, should_ignore_layer)
 
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
-from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import (
-    VllmCompressedTensorsW8A8Fp8,
-)
-from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import (
-    VllmCompressedTensorsW8A8Int8,
-)
-from tpu_inference.layers.vllm.quantization.unquantized import VllmUnquantizedConfig
+from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
+    CompressedTensorsW8A8Fp8MoEMethod
+from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import \
+    VllmCompressedTensorsW8A8Fp8
+from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import \
+    VllmCompressedTensorsW8A8Int8
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedConfig
 
 P = PartitionSpec
 logger = init_logger(__name__)
 
 
 @register_quantization_config("jax-compressed-tensors")
 class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
+
     @classmethod
     def get_name(cls) -> str:
         return "jax-compressed-tensors"
 
-    def get_scheme(
-        self, layer: torch.nn.Module, layer_name: Optional[str] = None
-    ) -> Optional["CompressedTensorsScheme"]:
+    def get_scheme(self,
+                   layer: torch.nn.Module,
+                   layer_name: Optional[str] = None
+                   ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non-uniform quantization in the following way:
 
@@ -71,24 +67,18 @@ def get_scheme(
         scheme_dict = self.target_scheme_map[matched_target]
         weight_quant = scheme_dict.get("weights")
         input_quant = scheme_dict.get("input_activations")
-        format = scheme_dict.get("format")
 
         if weight_quant is None:
-            logger.warning_once(
-                "Acceleration for non-quantized schemes is "
-                "not supported by Compressed Tensors. "
-                "Falling back to UnquantizedLinearMethod"
-            )
+            logger.warning_once("Acceleration for non-quantized schemes is "
+                                "not supported by Compressed Tensors. "
+                                "Falling back to UnquantizedLinearMethod")
             return None
 
         # TODO(kyuyeunk): Add support for different act_quant_format
-        act_quant_format = (
-            is_activation_quantization_format(  # noqa: F841
-                format
-            )
-            if format is not None
-            else is_activation_quantization_format(self.quant_format)
-        )
+        # act_quant_format = (
+        #     is_activation_quantization_format(  # noqa: F841
+        #         format) if format is not None else
+        #     is_activation_quantization_format(self.quant_format))
 
         linear_config = self.get_linear_config(layer)
         if self._is_fp8_w8a8(weight_quant, input_quant):
@@ -105,28 +95,29 @@ def get_scheme(
             input_symmetric=input_quant.symmetric,
             jax_config=linear_config,
         )
-        raise NotImplementedError("No compressed-tensors compatible scheme was found.")
+        raise NotImplementedError(
+            "No compressed-tensors compatible scheme was found.")
 
     def get_quant_method(
         self,
         layer: torch.nn.Module,
         prefix: str,
     ) -> Optional[QuantizeMethodBase]:
-        if should_ignore_layer(
-            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
-        ):
+        if should_ignore_layer(prefix,
+                               ignore=self.ignore,
+                               fused_mapping=self.packed_modules_mapping):
             return VllmUnquantizedConfig.get_quant_method(self, layer, prefix)
         if isinstance(layer, LinearBase):
             scheme = self.get_scheme(layer=layer, layer_name=prefix)
             if scheme is None:
-                return VllmUnquantizedConfig.get_quant_method(self, layer, prefix)
+                return VllmUnquantizedConfig.get_quant_method(
+                    self, layer, prefix)
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
         if isinstance(layer, FusedMoE):
             print("HERE", layer)
-            return CompressedTensorsW8A8Fp8MoEMethod(
-                self, layer.quant_config, self.mesh
-            )
+            return CompressedTensorsW8A8Fp8MoEMethod(self, layer.quant_config,
+                                                     self.mesh)
         if isinstance(layer, Attention):
             return CompressedTensorsKVCacheMethod(self)
         return None
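
For reference, register_quantization_config("jax-compressed-tensors") adds this class to vLLM's quantization registry so the method can be resolved by name. A minimal sketch of that lookup, assuming vLLM's get_quantization_config helper (not part of this diff) and that the module above has already been imported so the decorator has run:

    from vllm.model_executor.layers.quantization import get_quantization_config

    # Resolve the class registered under "jax-compressed-tensors" above.
    # Works only after the decorated module has been imported.
    quant_cls = get_quantization_config("jax-compressed-tensors")
    assert quant_cls.get_name() == "jax-compressed-tensors"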
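
The get_scheme() dispatch above keys off the per-target quantization args that CompressedTensorsConfig parses out of the checkpoint's compressed-tensors config. A hypothetical sketch of the shape target_scheme_map encodes for a W8A8 FP8 checkpoint (plain dicts for illustration; the real map holds parsed QuantizationArgs objects, and exact fields vary by checkpoint):

    # Hypothetical per-target scheme map for a W8A8 FP8 checkpoint.
    # get_scheme() looks up the target matched by find_matched_target(),
    # then reads "weights" and "input_activations" to pick a scheme.
    target_scheme_map = {
        "Linear": {
            "weights": {"num_bits": 8, "type": "float", "symmetric": True,
                        "strategy": "channel", "dynamic": False},
            "input_activations": {"num_bits": 8, "type": "float",
                                  "symmetric": True, "strategy": "token",
                                  "dynamic": True},
        },
    }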