27 | 27 | from pydantic.dataclasses import dataclass |
28 | 28 | from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE |
29 | 29 | from torch.distributed import ProcessGroup, ReduceOp |
30 | | -from transformers import PretrainedConfig |
31 | 30 | from typing_extensions import Self, deprecated, runtime_checkable |
32 | 31 |
33 | 32 | import vllm.envs as envs |
34 | 33 | from vllm import version |
35 | 34 | from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass |
36 | 35 | from vllm.logger import init_logger |
37 | | -from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, |
38 | | - QuantizationMethods, |
39 | | - get_quantization_config) |
40 | | -from vllm.model_executor.models import ModelRegistry |
41 | 36 | from vllm.platforms import current_platform |
42 | | -from vllm.tracing import is_otel_available, otel_import_error_traceback |
43 | 37 | from vllm.transformers_utils.config import ( |
44 | 38 | ConfigFormat, get_config, get_hf_image_processor_config, |
45 | 39 | get_hf_text_config, get_pooling_config, |
48 | 42 | try_get_tokenizer_config, uses_mrope) |
49 | 43 | from vllm.transformers_utils.s3_utils import S3Model |
50 | 44 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect |
| 45 | +# yapf conflicts with isort for this block |
| 46 | +# yapf: disable |
51 | 47 | from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, |
52 | 48 | MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, |
53 | 49 | POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, |
54 | | - LayerBlockType, common_broadcastable_dtype, |
| 50 | + LayerBlockType, LazyLoader, common_broadcastable_dtype, |
55 | 51 | cuda_device_count_stateless, get_cpu_memory, |
56 | 52 | get_open_port, is_torch_equal_or_newer, random_uuid, |
57 | 53 | resolve_obj_by_qualname) |
58 | 54 |
| 55 | +# yapf: enable |
| 56 | + |
59 | 57 | if TYPE_CHECKING: |
60 | 58 | from _typeshed import DataclassInstance |
61 | 59 | from ray.util.placement_group import PlacementGroup |
| 60 | + from transformers.configuration_utils import PretrainedConfig |
62 | 61 |
| 62 | + import vllm.model_executor.layers.quantization as me_quant |
| 63 | + import vllm.model_executor.models as me_models |
63 | 64 | from vllm.executor.executor_base import ExecutorBase |
| 65 | + from vllm.model_executor.layers.quantization import QuantizationMethods |
64 | 66 | from vllm.model_executor.layers.quantization.base_config import ( |
65 | 67 | QuantizationConfig) |
66 | 68 | from vllm.model_executor.model_loader import BaseModelLoader |
67 | 69 | from vllm.model_executor.model_loader.tensorizer import TensorizerConfig |
68 | 70 |
69 | 71 | ConfigType = type[DataclassInstance] |
| 72 | +    HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] |
70 | 73 | else: |
71 | 74 | PlacementGroup = Any |
| 75 | + PretrainedConfig = Any |
72 | 76 | ExecutorBase = Any |
73 | 77 | QuantizationConfig = Any |
| 78 | + QuantizationMethods = Any |
74 | 79 | BaseModelLoader = Any |
75 | 80 | TensorizerConfig = Any |
76 | 81 | ConfigType = type |
| 82 | +    HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] |
| 83 | + |
| 84 | +    me_quant = LazyLoader("me_quant", globals(), |
| 85 | +                          "vllm.model_executor.layers.quantization") |
| 86 | +    me_models = LazyLoader("me_models", globals(), |
| 87 | +                           "vllm.model_executor.models") |
77 | 88 |
78 | 89 | logger = init_logger(__name__) |
79 | 90 |
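Note on the block above: the core trick of this change is that heavyweight packages (`transformers`, the quantization backends, the model registry) are imported for real only under `TYPE_CHECKING`; at runtime, `LazyLoader` binds a module proxy that performs the import on first attribute access. Below is a minimal sketch of that proxy, assuming `vllm.utils.LazyLoader` follows the familiar TensorFlow-style design; the `_LazyModule` class and names are illustrative stand-ins, not vLLM's actual implementation:

```python
import importlib
import types


class _LazyModule(types.ModuleType):
    """Proxy that defers importing `name` until first attribute access."""

    def __init__(self, local_name: str, parent_globals: dict, name: str):
        super().__init__(name)
        self._local_name = local_name
        self._parent_globals = parent_globals

    def _load(self) -> types.ModuleType:
        # Import for real, then rebind the caller's global so later
        # lookups bypass this proxy entirely.
        module = importlib.import_module(self.__name__)
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item: str):
        # Only reached while the real module has not been loaded yet.
        return getattr(self._load(), item)


# Usage mirroring the diff: no import cost is paid until first use.
me_models = _LazyModule("me_models", globals(), "vllm.model_executor.models")
# This line (uncommented) would trigger the actual import:
# registry = me_models.ModelRegistry
```

Because `_load` rebinds the caller's global under `local_name`, the first constructor argument should match the variable the proxy is assigned to (as `"me_quant"` and `"me_models"` do above).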
@@ -100,9 +111,6 @@ |
100 | 111 | for task in tasks |
101 | 112 | } |
102 | 113 |
103 | | -HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], |
104 | | - PretrainedConfig]] |
105 | | - |
106 | 114 |
107 | 115 | @runtime_checkable |
108 | 116 | class SupportsHash(Protocol): |
@@ -648,7 +656,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": |
648 | 656 |
649 | 657 | @property |
650 | 658 | def registry(self): |
651 | | - return ModelRegistry |
| 659 | + return me_models.ModelRegistry |
652 | 660 |
653 | 661 | @property |
654 | 662 | def architectures(self) -> list[str]: |
@@ -859,14 +867,15 @@ def _parse_quant_hf_config(self): |
859 | 867 | return quant_cfg |
860 | 868 |
861 | 869 | def _verify_quantization(self) -> None: |
862 | | - supported_quantization = QUANTIZATION_METHODS |
| 870 | + supported_quantization = me_quant.QUANTIZATION_METHODS |
863 | 871 | optimized_quantization_methods = [ |
864 | 872 | "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", |
865 | 873 | "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", |
866 | 874 | "quark", "modelopt_fp4", "bitblas", "gptq_bitblas" |
867 | 875 | ] |
868 | 876 | if self.quantization is not None: |
869 | | - self.quantization = cast(QuantizationMethods, self.quantization) |
| 877 | + self.quantization = cast(me_quant.QuantizationMethods, |
| 878 | + self.quantization) |
870 | 879 |
871 | 880 | # Parse quantization method from the HF model config, if available. |
872 | 881 | quant_cfg = self._parse_quant_hf_config() |
@@ -900,14 +909,14 @@ def _verify_quantization(self) -> None: |
900 | 909 |
902 | 911 | # Detect which checkpoint format this is |
902 | 911 | for name in quantization_methods: |
903 | | - method = get_quantization_config(name) |
| 912 | + method = me_quant.get_quantization_config(name) |
904 | 913 | quantization_override = method.override_quantization_method( |
905 | 914 | quant_cfg, self.quantization) |
906 | 915 | if quantization_override is not None: |
907 | 916 | # Raise error if the override is not custom (custom would |
908 | 917 | # be in QUANTIZATION_METHODS but not QuantizationMethods) |
909 | 918 | # and hasn't been added to the overrides list. |
910 | | - if (name in get_args(QuantizationMethods) |
| 919 | + if (name in get_args(me_quant.QuantizationMethods) |
911 | 920 | and name not in overrides): |
912 | 921 | raise ValueError( |
913 | 922 | f"Quantization method {name} is an override but " |
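For context on the detection loop above: each registered backend can expose a classmethod `override_quantization_method` that inspects the checkpoint's HF quantization config and claims it on behalf of a more optimized kernel (e.g. promoting plain `gptq` to `gptq_marlin`). Here is a sketch of the hook's shape, with an invented backend and a deliberately simplified constraint check; the real configs subclass `QuantizationConfig` and also validate device capability, group size, and so on:

```python
from typing import Any, Optional


class FakeMarlinConfig:
    """Invented backend; only illustrates the override hook's shape."""

    @classmethod
    def override_quantization_method(
            cls, hf_quant_cfg: dict[str, Any],
            user_quant: Optional[str]) -> Optional[str]:
        # Claim GPTQ checkpoints that satisfy this kernel's constraints.
        if (hf_quant_cfg.get("quant_method") == "gptq"
                and hf_quant_cfg.get("bits") in (4, 8)):
            return "gptq_marlin"  # caller switches to this method
        return None  # not ours; let the next backend inspect the config
```

The loop takes the first non-`None` answer, which is why a non-custom method that overrides another must also appear in the `overrides` allow-list checked above.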
@@ -1417,7 +1426,7 @@ def runner_type(self) -> RunnerType: |
1417 | 1426 | @property |
1418 | 1427 | def is_v1_compatible(self) -> bool: |
1419 | 1428 | architectures = getattr(self.hf_config, "architectures", []) |
1420 | | - return ModelRegistry.is_v1_compatible(architectures) |
| 1429 | + return me_models.ModelRegistry.is_v1_compatible(architectures) |
1421 | 1430 |
1422 | 1431 | @property |
1423 | 1432 | def is_matryoshka(self) -> bool: |
@@ -2376,7 +2385,7 @@ class SpeculativeConfig: |
2376 | 2385 | according to the log probability settings in SamplingParams.""" |
2377 | 2386 |
2378 | 2387 | # Draft model configuration |
2379 | | - quantization: Optional[QuantizationMethods] = None |
| 2388 | + quantization: Optional[me_quant.QuantizationMethods] = None |
2380 | 2389 | """Quantization method that was used to quantize the draft model weights. |
2381 | 2390 | If `None`, we assume the model weights are not quantized. Note that it only |
2382 | 2391 | takes effect when using the draft model-based speculative method.""" |
@@ -3624,6 +3633,7 @@ def __post_init__(self): |
3624 | 3633 | and "," in self.collect_detailed_traces[0]): |
3625 | 3634 | self._parse_collect_detailed_traces() |
3626 | 3635 |
| 3636 | + from vllm.tracing import is_otel_available, otel_import_error_traceback |
3627 | 3637 | if not is_otel_available() and self.otlp_traces_endpoint is not None: |
3628 | 3638 | raise ValueError( |
3629 | 3639 | "OpenTelemetry is not available. Unable to configure " |
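Moving the `vllm.tracing` import into `__post_init__` (above) means OpenTelemetry, an optional dependency, is only imported when a trace endpoint is actually configured. A quick way to verify what all of these deferrals save at startup is CPython's standard `-X importtime` flag; the harness below is a sketch and assumes vLLM is installed in the current environment:

```python
import subprocess
import sys

# -X importtime makes the interpreter print a cumulative import-time
# tree to stderr for everything pulled in by the command.
result = subprocess.run(
    [sys.executable, "-X", "importtime", "-c", "import vllm"],
    capture_output=True,
    text=True,
)

# After this change, eager entries for the quantization backends and the
# tracing stack should no longer appear in the report.
for line in result.stderr.splitlines():
    if "quantization" in line or "tracing" in line:
        print(line)
```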