
Commit a55ba1b

kylesayrs authored and jinzhen-lin committed
[Quant] [Bugfix] Fix quantization config matching with hf_to_vllm_mapper (vllm-project#20046)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
1 parent 410ce21 commit a55ba1b

File tree

17 files changed, +107 −29 lines changed

tests/quantization/test_register_quantization_config.py

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig):
 
     def __init__(self, num_bits: int = 8) -> None:
         """Initialize the quantization config."""
+        super().__init__()
         self.num_bits = num_bits
 
     def get_name(self) -> QuantizationMethods:
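
Several of the touched configs gain the same one-line change: the subclass constructor now calls super().__init__() before setting its own fields. A generic, self-contained illustration (not vLLM code) of why that call matters when the base class owns state of its own:

class Base:
    def __init__(self) -> None:
        # State owned by the base class; only exists if Base.__init__ runs.
        self.shared_state: dict[str, str] = {}


class WithSuper(Base):
    def __init__(self, bits: int) -> None:
        super().__init__()  # base-class state is initialized first
        self.bits = bits


class WithoutSuper(Base):
    def __init__(self, bits: int) -> None:
        self.bits = bits  # Base.__init__ never runs


print(hasattr(WithSuper(8), "shared_state"))     # True
print(hasattr(WithoutSuper(8), "shared_state"))  # False

The same reasoning applies to the bitblas, gptq_bitblas, marlin, and modelopt config changes below.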

vllm/lora/models.py

Lines changed: 1 addition & 1 deletion
@@ -805,7 +805,7 @@ def create_lora_manager(
         lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
         **kwargs) -> LoRAModelManager:
     """Create a LoRA adapter for a given model."""
-    if not hasattr(model, "packed_modules_mapping"):
+    if not isinstance(model, SupportsLoRA):
         raise ValueError(f"Model {type(model)} is not supported for LoRA.")
     lora_manager = lora_manager_cls(
         model=model,

vllm/lora/worker_manager.py

Lines changed: 1 addition & 4 deletions
@@ -111,10 +111,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
         # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
         # to ensure correct loading of lora weights.
         model = self._adapter_manager.model
-        hf_to_vllm_mapper = None
-        if (hasattr(model, "hf_to_vllm_mapper")
-                and model.hf_to_vllm_mapper is not None):
-            hf_to_vllm_mapper = model.hf_to_vllm_mapper
+        hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)
 
         lora = self._lora_model_cls.from_local_checkpoint(
             lora_path,
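
The four removed lines and the single added line behave the same way: getattr with a None default returns None both when the attribute is missing and when it exists but is explicitly None. A small standalone check (the two dummy classes are illustrative only):

class MapperIsNone:
    hf_to_vllm_mapper = None


class NoMapperAttr:
    pass


for model in (MapperIsNone(), NoMapperAttr()):
    # Old form: attribute must exist and be non-None.
    old_style = (model.hf_to_vllm_mapper if hasattr(model, "hf_to_vllm_mapper")
                 and model.hf_to_vllm_mapper is not None else None)
    # New form: a single getattr call with a default.
    new_style = getattr(model, "hf_to_vllm_mapper", None)
    assert old_style is None and new_style is None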

vllm/model_executor/layers/quantization/base_config.py

Lines changed: 13 additions & 0 deletions
@@ -10,6 +10,7 @@
 
 if TYPE_CHECKING:
     from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.model_executor.models.utils import WeightsMapper
 else:
     QuantizationMethods = str
 

@@ -149,3 +150,15 @@ def get_quant_method(self, layer: torch.nn.Module,
 
     def get_cache_scale(self, name: str) -> Optional[str]:
         return None
+
+    def apply_vllm_mapper(  # noqa: B027
+            self, hf_to_vllm_mapper: "WeightsMapper"):
+        """
+        Interface for models to update module names referenced in
+        quantization configs in order to reflect the vllm model structure
+
+        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
+            structure of the qconfig) to vllm model structure
+        """
+        # TODO (@kylesayrs): add implementations for all subclasses
+        pass
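
The diff shown here only adds the hook; it does not show where apply_vllm_mapper is invoked. A hypothetical call site (names and placement are assumptions, not part of this commit) would forward a model's hf_to_vllm_mapper to its quantization config before weight names are matched against it:

from typing import Optional

import torch.nn as nn

from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)


def remap_quant_config(model: nn.Module,
                       quant_config: Optional[QuantizationConfig]) -> None:
    """Hypothetical helper: if the model defines an hf_to_vllm_mapper,
    let the quant config rewrite the module names it references so they
    match the vLLM module tree instead of the HF checkpoint layout."""
    hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)
    if quant_config is not None and hf_to_vllm_mapper is not None:
        quant_config.apply_vllm_mapper(hf_to_vllm_mapper)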

vllm/model_executor/layers/quantization/bitblas.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ def __init__(
         # (since we have only one group per output channel)
         desc_act = False
 
+        super().__init__()
         self.weight_bits = weight_bits
         self.group_size = group_size
         self.desc_act = desc_act

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 16 additions & 1 deletion
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import suppress
-from typing import Any, Literal, Optional, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, cast
 
 import torch
 from compressed_tensors.config import (CompressionFormat,

@@ -37,6 +37,9 @@
     cutlass_fp4_supported)
 from vllm.platforms import current_platform
 
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
 logger = init_logger(__name__)
 
 __all__ = ["CompressedTensorsLinearMethod"]

@@ -80,6 +83,18 @@ def get_min_capability(cls) -> int:
     def get_name(self) -> QuantizationMethods:
         return "compressed-tensors"
 
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
+            self.target_scheme_map)
+        self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
+        self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict(
+            self.sparsity_scheme_map)
+        self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list(
+            self.sparsity_ignore_list)
+        if self.kv_cache_scheme is not None:
+            self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(
+                self.kv_cache_scheme)
+
     def get_quant_method(
         self,
         layer: torch.nn.Module,
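
apply_vllm_mapper relies on WeightsMapper.apply_dict and WeightsMapper.apply_list to rewrite the module names held in the config's keys and lists. As a rough sketch of that effect, here is a toy stand-in mapper (the real WeightsMapper lives in vllm.model_executor.models.utils and its matching rules may differ); it only rewrites name prefixes:

class ToyPrefixMapper:
    """Toy stand-in for WeightsMapper, used purely for illustration."""

    def __init__(self, orig_to_new_prefix: dict[str, str]) -> None:
        self.orig_to_new_prefix = orig_to_new_prefix

    def _map_name(self, name: str) -> str:
        for old, new in self.orig_to_new_prefix.items():
            if name.startswith(old):
                return new + name[len(old):]
        return name

    def apply_list(self, names: list[str]) -> list[str]:
        return [self._map_name(n) for n in names]

    def apply_dict(self, mapping: dict) -> dict:
        return {self._map_name(k): v for k, v in mapping.items()}


# E.g. a multimodal model that nests its HF language model under
# "language_model.model." would remap an ignore list like this:
mapper = ToyPrefixMapper({"model.": "language_model.model."})
print(mapper.apply_list(["model.layers.0.mlp.down_proj", "lm_head"]))
# ['language_model.model.layers.0.mlp.down_proj', 'lm_head']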

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 9 additions & 1 deletion
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import functools
-from typing import Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F

@@ -39,6 +39,9 @@
 from vllm.scalar_type import scalar_types
 from vllm.utils import has_deep_gemm
 
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
 ACTIVATION_SCHEMES = ["static", "dynamic"]
 
 logger = init_logger(__name__)

@@ -100,6 +103,11 @@ def get_min_capability(cls) -> int:
     def get_config_filenames(cls) -> list[str]:
         return []
 
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        if self.ignored_layers is not None:
+            self.ignored_layers = hf_to_vllm_mapper.apply_list(
+                self.ignored_layers)
+
     @classmethod
     def from_config(cls, config: dict[str, Any]) -> "Fp8Config":
         quant_method = cls.get_from_keys(config, ["quant_method"])

vllm/model_executor/layers/quantization/gptq_bitblas.py

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ def __init__(
         # (since we have only one group per output channel)
         desc_act = False
 
+        super().__init__()
         self.weight_bits = weight_bits
         self.group_size = group_size
         self.desc_act = desc_act

vllm/model_executor/layers/quantization/marlin.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@ def __init__(
         group_size: int,
         lm_head_quantized: bool,
     ) -> None:
+        super().__init__()
+
         # Group size for the quantization.
         self.group_size = group_size
         self.lm_head_quantized = lm_head_quantized

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 1 addition & 0 deletions
@@ -181,6 +181,7 @@ def __init__(
         exclude_modules: list[str],
         group_size: int = 16,
     ) -> None:
+        super().__init__()
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
         if is_checkpoint_nvfp4_serialized:
             logger.warning(
