Run compressed hf quantizer #206

Closed
Wants to merge 10 commits
2 changes: 2 additions & 0 deletions src/compressed_tensors/compressors/base.py
@@ -164,6 +164,8 @@ def decompress_module(self, module: Module):
return None # module is not quantized
quantization_scheme = module.quantization_scheme
if not hasattr(quantization_scheme, "weights"):
# models that ran CompressedLinear.from_linear will
# run delattr(module, "weight")
return None # weights are not quantized

quantization_args = quantization_scheme.weights
@@ -103,12 +103,16 @@ def from_pretrained(
:return: compressor for the configs, or None if model is not compressed
"""
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None)
compression_config = getattr(config, COMPRESSION_CONFIG_NAME, None) or getattr(
config, QUANTIZATION_CONFIG_NAME, None
)

return cls.from_compression_config(compression_config)

@classmethod
def from_compression_config(
cls, compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
cls,
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
):
"""
:param compression_config:
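The fallback above lets ModelCompressor.from_pretrained pick up configs serialized under either key. A minimal sketch of the lookup order, assuming the key names "compression_config" and "quantization_config" for the two constants, might look like this:

# Sketch of the lookup order introduced above; the constant values are assumptions.
COMPRESSION_CONFIG_NAME = "compression_config"
QUANTIZATION_CONFIG_NAME = "quantization_config"

def find_compression_config(config):
    # prefer the dedicated compression_config entry, then fall back to the
    # HF-style quantization_config so transformers checkpoints still load
    return getattr(config, COMPRESSION_CONFIG_NAME, None) or getattr(
        config, QUANTIZATION_CONFIG_NAME, None
    )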
4 changes: 3 additions & 1 deletion src/compressed_tensors/linear/compressed_linear.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Tuple

import torch
from compressed_tensors.compressors.base import BaseCompressor
from compressed_tensors.quantization import (
@@ -53,7 +55,7 @@ def from_linear(
)

# get the shape and dtype of compressed parameters
compression_params = module.compressor.compression_param_info(
compression_params: Dict[str, Tuple] = module.compressor.compression_param_info(
module.weight.shape, quantization_scheme.weights
)

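For reference, compression_param_info appears to map each compressed parameter name to a (shape, dtype) tuple. The sketch below, with an assumed helper name, shows how such a mapping could be used to register placeholder parameters on the module:

import torch
from typing import Dict, Tuple

def register_compressed_params(
    module: torch.nn.Module,
    compression_params: Dict[str, Tuple[torch.Size, torch.dtype]],
) -> None:
    # create empty placeholders; real values are loaded from the checkpoint
    for name, (shape, dtype) in compression_params.items():
        param = torch.nn.Parameter(
            torch.empty(shape, dtype=dtype), requires_grad=False
        )
        module.register_parameter(name, param)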
4 changes: 3 additions & 1 deletion src/compressed_tensors/quantization/lifecycle/apply.py
@@ -106,7 +106,8 @@ def apply_quantization_config(
model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
) -> OrderedDict:
"""
Initializes the model for quantization in-place based on the given config
Initializes the model for quantization in-place based on the given config.
Optionally converts quantizable modules to CompressedLinear modules

:param model: model to apply quantization config to
:param config: quantization config
@@ -135,6 +136,7 @@ def apply_quantization_config(

# list of submodules to ignore
ignored_submodules = defaultdict(list)

# mark appropriate layers for quantization by setting their quantization schemes
for name, submodule in iter_named_quantizable_modules(
model,
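A hedged usage sketch of the new flag: passing run_compressed=True when applying a config is intended to swap quantized Linear layers for CompressedLinear modules during initialization. The model and config objects below are illustrative, not taken from this PR.

# Illustrative only: `model` is any torch model and `quant_config` is a
# QuantizationConfig parsed from the checkpoint's quantization config entry.
apply_quantization_config(model, quant_config, run_compressed=True)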
18 changes: 16 additions & 2 deletions src/compressed_tensors/quantization/quant_config.py
@@ -35,6 +35,7 @@
__all__ = [
"QuantizationStatus",
"QuantizationConfig",
"QuantizationConfigHFQuantizer",
"LIFECYCLE_ORDER",
"DEFAULT_QUANTIZATION_METHOD",
"DEFAULT_QUANTIZATION_FORMAT",
@@ -132,9 +133,10 @@ class QuantizationConfig(BaseModel):
`k_proj` and `v_proj` in their names. If this is not the case
and kv_cache_scheme != None, the quantization of kv cache will fail
:global_compression_ratio: optional informational config to report the model
compression ratio achieved by the quantization config
:ignore: optional list of layers to ignore from config_groups. Layers in this list
are not quantized even if they match up with a target in config_groups
:version: version of the compressors used to optimize the given model
"""

config_groups: Dict[str, Union[QuantizationScheme, List[str]]]
@@ -144,6 +146,7 @@
quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED
global_compression_ratio: Optional[float] = None
ignore: Optional[List[str]] = Field(default_factory=list)
version: str = "0.7.1"

def model_post_init(self, __context):
"""
@@ -262,3 +265,14 @@ def requires_calibration_data(self):
return True

return False


# Allows HFQuantizer to adjust run_compressed on model load
class QuantizationConfigHFQuantizer(QuantizationConfig):
"""
:param run_compressed: param used to set run_compressed.
Used for `apply_quantization_config` in CompressedTensorsHfQuantizer
in transformers
"""

run_compressed: bool = True
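
A hedged example of how the HFQuantizer-facing subclass might be constructed and threaded through; the config_groups contents and the `model` object below are illustrative, not taken from this PR.

# run_compressed=False keeps dense Linear modules on load; True converts
# quantized layers to CompressedLinear
hf_config = QuantizationConfigHFQuantizer(
    config_groups={},  # would be populated from the serialized config
    run_compressed=False,
)
apply_quantization_config(model, hf_config, run_compressed=hf_config.run_compressed)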