diff --git a/src/compressed_tensors/compressors/base.py b/src/compressed_tensors/compressors/base.py
index ee751053..2d43a9e4 100644
--- a/src/compressed_tensors/compressors/base.py
+++ b/src/compressed_tensors/compressors/base.py
@@ -164,6 +164,8 @@ def decompress_module(self, module: Module):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
         if not hasattr(quantization_scheme, "weights"):
+            # models that ran CompressedLinear.from_linear will
+            # run delattr(module, "weight")
             return None  # weights are not quantized
 
         quantization_args = quantization_scheme.weights
diff --git a/src/compressed_tensors/compressors/model_compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressors/model_compressor.py
index 6473554d..29318d20 100644
--- a/src/compressed_tensors/compressors/model_compressors/model_compressor.py
+++ b/src/compressed_tensors/compressors/model_compressors/model_compressor.py
@@ -103,12 +103,16 @@ def from_pretrained(
         :return: compressor for the configs, or None if model is not compressed
         """
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None)
+        compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None) or getattr(
+            config, QUANTIZATION_CONFIG_NAME, None
+        )
+
         return cls.from_compression_config(compression_config)
 
     @classmethod
     def from_compression_config(
-        cls, compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
+        cls,
+        compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
     ):
         """
         :param compression_config:
diff --git a/src/compressed_tensors/linear/compressed_linear.py b/src/compressed_tensors/linear/compressed_linear.py
index a4d5b532..3e2b2f5f 100644
--- a/src/compressed_tensors/linear/compressed_linear.py
+++ b/src/compressed_tensors/linear/compressed_linear.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Dict, Tuple
+
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.quantization import (
@@ -53,7 +55,7 @@ def from_linear(
         )
 
         # get the shape and dtype of compressed parameters
-        compression_params = module.compressor.compression_param_info(
+        compression_params: Dict[str, Tuple] = module.compressor.compression_param_info(
             module.weight.shape, quantization_scheme.weights
         )
 
diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py
index 7c498787..50d89b48 100644
--- a/src/compressed_tensors/quantization/lifecycle/apply.py
+++ b/src/compressed_tensors/quantization/lifecycle/apply.py
@@ -106,7 +106,8 @@ def apply_quantization_config(
     model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
 ) -> OrderedDict:
     """
-    Initializes the model for quantization in-place based on the given config
+    Initializes the model for quantization in-place based on the given config.
+    Optionally converts quantizable modules to compressed_linear modules
 
     :param model: model to apply quantization config to
     :param config: quantization config
@@ -135,6 +136,7 @@ def apply_quantization_config(
 
     # list of submodules to ignore
     ignored_submodules = defaultdict(list)
+    # mark appropriate layers for quantization by setting their quantization schemes
     for name, submodule in iter_named_quantizable_modules(
         model,
diff --git a/src/compressed_tensors/quantization/quant_config.py b/src/compressed_tensors/quantization/quant_config.py
index 04c8deb7..91df005d 100644
--- a/src/compressed_tensors/quantization/quant_config.py
+++ b/src/compressed_tensors/quantization/quant_config.py
@@ -35,6 +35,7 @@
 __all__ = [
     "QuantizationStatus",
     "QuantizationConfig",
+    "QuantizationConfigHFQuantizer",
     "LIFECYCLE_ORDER",
     "DEFAULT_QUANTIZATION_METHOD",
     "DEFAULT_QUANTIZATION_FORMAT",
@@ -132,9 +133,10 @@ class QuantizationConfig(BaseModel):
         `k_proj` and `v_proj` in their names. If this is not the case
         and kv_cache_scheme != None, the quantization of kv cache will fail
     :global_compression_ratio: optional informational config to report the model
-        compression ratio acheived by the quantization config
+        compression ratio achieved by the quantization config
     :ignore: optional list of layers to ignore from config_groups. Layers in this list
-        are not quantized even if they match up with a target in config_groups
+        are not quantized even if they match up with a target in config_groups
+    :version: version of the compressors used to optimize the given model
     """
 
     config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
@@ -144,6 +146,7 @@ class QuantizationConfig(BaseModel):
     quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
     global_compression_ratio: Optional[float] = None
     ignore: Optional[List[str]] = Field(default_factory=list)
+    version: str = "0.7.1"
 
     def model_post_init(self, __context):
         """
@@ -262,3 +265,14 @@ def requires_calibration_data(self):
                     return True
 
         return False
+
+
+# For HFQuantizer, be able to adjust run_compressed on model load
+class QuantizationConfigHFQuantizer(QuantizationConfig):
+    """
+    :param run_compressed: param used to set run_compressed.
+        Used for `apply_quantization_config` in CompressedTensorsHfQuantizer
+        in transformers
+    """
+
+    run_compressed: bool = True
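
Below is a minimal, hypothetical usage sketch (not part of the diff) of how the added pieces are expected to fit together: QuantizationConfigHFQuantizer carries the run_compressed flag that CompressedTensorsHfQuantizer in transformers can forward to apply_quantization_config. The toy model, the "group_0" name, and the 8-bit weight-only scheme are placeholders chosen for illustration.

import torch
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    apply_quantization_config,
)
from compressed_tensors.quantization.quant_config import QuantizationConfigHFQuantizer

# toy stand-in for a real transformers model
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 8))

# HFQuantizer-facing config: run_compressed decides whether quantizable Linear
# modules are converted to CompressedLinear when the config is applied
config = QuantizationConfigHFQuantizer(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    },
    run_compressed=False,
)

# marks matching Linear layers for quantization and initializes their quant params
apply_quantization_config(model, config, run_compressed=config.run_compressed)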