From d4187692c67847071167c34e68c3e36cdb4647e2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 10 Apr 2025 19:41:36 +0200 Subject: [PATCH 1/8] Improve configs - `LoadConfig` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 67 ++++++++++++++++++---------------- vllm/engine/arg_utils.py | 78 +++++++++++++--------------------------- 2 files changed, 60 insertions(+), 85 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 23541a884d91..fe4c5c2fc689 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1431,44 +1431,49 @@ class LoadFormat(str, enum.Enum): FASTSAFETENSORS = "fastsafetensors" +@config @dataclass class LoadConfig: - """ - download_dir: Directory to download and load the weights, default to the - default cache directory of huggingface. - load_format: The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and - fall back to the pytorch bin format if safetensors format is - not available. - "pt" will load the weights in the pytorch bin format. - "safetensors" will load the weights in the safetensors format. - "npcache" will load the weights in pytorch format and store - a numpy cache to speed up the loading. - "dummy" will initialize the weights with random values, which is - mainly for profiling. - "tensorizer" will use CoreWeave's tensorizer library for - fast weight loading. - "bitsandbytes" will load nf4 type weights. - "sharded_state" will load weights from pre-sharded checkpoint files, - supporting efficient loading of tensor-parallel models. - "gguf" will load weights from GGUF format files. - "mistral" will load weights from consolidated safetensors files used - by Mistral models. - "runai_streamer" will load weights from RunAI streamer format files. - model_loader_extra_config: The extra config for the model loader. - ignore_patterns: The list of patterns to ignore when loading the model. - Default to "original/**/*" to avoid repeated loading of llama's - checkpoints. - use_tqdm_on_load: Whether to enable tqdm for showing progress bar during - loading. Default to True - """ + """Configuration for loading the model weights.""" load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO + """ + The format of the model weights to load: + + - "auto" will try to load the weights in the safetensors format and fall + back to the pytorch bin format if safetensors format is not available. + - "pt" will load the weights in the pytorch bin format. + - "safetensors" will load the weights in the safetensors format. + - "npcache" will load the weights in pytorch format and store a numpy cache + to speed up the loading. + - "dummy" will initialize the weights with random values, which is mainly + for profiling. + - "tensorizer" will use CoreWeave's tensorizer library for fast weight + loading. See the Tensorize vLLM Model script in the Examples section for + more information. + - "runai_streamer" will load the Safetensors weights using Run:ai Model + Streamer. + - "bitsandbytes" will load the weights using bitsandbytes quantization. + - "sharded_state" will load weights from pre-sharded checkpoint files, + supporting efficient loading of tensor-parallel models. + - "gguf" will load weights from GGUF format files (details specified in + https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). + - "mistral" will load weights from consolidated safetensors files used by + Mistral models. + """ download_dir: Optional[str] = None - model_loader_extra_config: Optional[Union[str, dict]] = field( - default_factory=dict) + """Directory to download and load the weights, default to the default + cache directory of Hugging Face.""" + model_loader_extra_config: Optional[Union[str, dict]] = None + """Extra config for model loader. This will be passed to the model loader + corresponding to the chosen load_format. This should be a JSON string that + will be parsed into a dictionary.""" ignore_patterns: Optional[Union[list[str], str]] = None + """The list of patterns to ignore when loading the model. Default to + "original/**/*" to avoid repeated loading of llama's checkpoints.""" use_tqdm_on_load: bool = True + """Whether to enable tqdm for showing progress bar when loading model + weights.""" def compute_hash(self) -> str: """ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba71a8770d17..9986fb83a20e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -101,8 +101,8 @@ class EngineArgs: tokenizer_mode: str = 'auto' trust_remote_code: bool = False allowed_local_media_path: str = "" - download_dir: Optional[str] = None - load_format: str = 'auto' + download_dir: Optional[str] = LoadConfig.download_dir + load_format: str = LoadConfig.load_format config_format: ConfigFormat = ConfigFormat.AUTO dtype: str = 'auto' kv_cache_dtype: str = 'auto' @@ -174,8 +174,10 @@ class EngineArgs: ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: Optional[int] = None num_lookahead_slots: int = 0 - model_loader_extra_config: Optional[dict] = None - ignore_patterns: Optional[Union[str, List[str]]] = None + model_loader_extra_config: Optional[ + dict] = LoadConfig.model_loader_extra_config + ignore_patterns: Optional[Union[str, + List[str]]] = LoadConfig.ignore_patterns preemption_mode: Optional[str] = None scheduler_delay_factor: float = 0.0 @@ -213,7 +215,7 @@ class EngineArgs: additional_config: Optional[Dict[str, Any]] = None enable_reasoning: Optional[bool] = None reasoning_parser: Optional[str] = None - use_tqdm_on_load: bool = True + use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load def __post_init__(self): if not self.tokenizer: @@ -333,38 +335,23 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]: "from directories specified by the server file system. " "This is a security risk. " "Should only be enabled in trusted environments.") - parser.add_argument('--download-dir', - type=nullable_str, - default=EngineArgs.download_dir, - help='Directory to download and load the weights.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[f.value for f in LoadFormat], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' - 'section for more information.\n' - '* "runai_streamer" will load the Safetensors weights using Run:ai' - 'Model Streamer.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n' - '* "sharded_state" will load weights from pre-sharded checkpoint ' - 'files, supporting efficient loading of tensor-parallel models\n' - '* "gguf" will load weights from GGUF format files (details ' - 'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n' - '* "mistral" will load weights from consolidated safetensors files ' - 'used by Mistral models.\n') + # Model loading arguments + load_kwargs = get_kwargs(LoadConfig) + load_group = parser.add_argument_group( + title="LoadConfig", + description=LoadConfig.__doc__, + ) + load_group.add_argument('--load-format', + choices=[f.value for f in LoadFormat], + **load_kwargs["load_format"]) + load_group.add_argument('--download-dir', + **load_kwargs["download_dir"]) + load_group.add_argument('--model-loader-extra-config', + **load_kwargs["model_loader_extra_config"]) + parser.add_argument('--use-tqdm-on-load', + action=argparse.BooleanOptionalAction, + **load_kwargs["use_tqdm_on_load"]) + parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -770,14 +757,6 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]: default=1, help=('Maximum number of forward steps per ' 'scheduler call.')) - parser.add_argument( - '--use-tqdm-on-load', - dest='use_tqdm_on_load', - action=argparse.BooleanOptionalAction, - default=EngineArgs.use_tqdm_on_load, - help='Whether to enable/disable progress bar ' - 'when loading model weights.', - ) parser.add_argument( '--multi-step-stream-outputs', @@ -806,15 +785,6 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]: default=None, help='The configurations for speculative decoding.' ' Should be a JSON string.') - - parser.add_argument('--model-loader-extra-config', - type=nullable_str, - default=EngineArgs.model_loader_extra_config, - help='Extra config for model loader. ' - 'This will be passed to the model loader ' - 'corresponding to the chosen load_format. ' - 'This should be a JSON string that will be ' - 'parsed into a dictionary.') parser.add_argument( '--ignore-patterns', action="append", From 9fc2cacf672986e9ecda4186d850b903428735ce Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:19:28 +0200 Subject: [PATCH 2/8] Enable newlines in argparse help strings Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 551f1a4c9d26..19e15fccf38a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2,7 +2,6 @@ from __future__ import annotations -import argparse import asyncio import concurrent import contextlib @@ -32,6 +31,8 @@ import uuid import warnings import weakref +from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, + ArgumentTypeError, RawTextHelpFormatter) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, @@ -1209,7 +1210,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: return wrapper -class StoreBoolean(argparse.Action): +class StoreBoolean(Action): def __call__(self, parser, namespace, values, option_string=None): if values.lower() == "true": @@ -1221,7 +1222,7 @@ def __call__(self, parser, namespace, values, option_string=None): "Expected 'true' or 'false'.") -class SortedHelpFormatter(argparse.ArgumentDefaultsHelpFormatter): +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawTextHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" def add_arguments(self, actions): @@ -1229,7 +1230,7 @@ def add_arguments(self, actions): super().add_arguments(actions) -class FlexibleArgumentParser(argparse.ArgumentParser): +class FlexibleArgumentParser(ArgumentParser): """ArgumentParser that allows both underscore and dash in names.""" def __init__(self, *args, **kwargs): @@ -1280,11 +1281,10 @@ def check_port(self, value): value = int(value) except ValueError: msg = "Port must be an integer" - raise argparse.ArgumentTypeError(msg) from None + raise ArgumentTypeError(msg) from None if not (1024 <= value <= 65535): - raise argparse.ArgumentTypeError( - "Port must be between 1024 and 65535") + raise ArgumentTypeError("Port must be between 1024 and 65535") return value From de4329af07bf4e6869f2693ced85406a7c292559 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:20:42 +0200 Subject: [PATCH 3/8] Update docstring formatting of `LoadConfig.load_format` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index fe4c5c2fc689..b638d42702bf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1437,9 +1437,7 @@ class LoadConfig: """Configuration for loading the model weights.""" load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO - """ - The format of the model weights to load: - + """The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. - "pt" will load the weights in the pytorch bin format. @@ -1459,8 +1457,7 @@ class LoadConfig: - "gguf" will load weights from GGUF format files (details specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). - "mistral" will load weights from consolidated safetensors files used by - Mistral models. - """ + Mistral models.""" download_dir: Optional[str] = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" From 0dd13b2358257e6e49d00bcad47187313d66cf50 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:22:57 +0200 Subject: [PATCH 4/8] Fix IntelliSense for `config` decorator Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b638d42702bf..3770adb4166a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ from importlib.util import find_spec from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, - Optional, Protocol, Union) + Optional, Protocol, TypeVar, Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -45,6 +45,7 @@ random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: + from _typeshed import DataclassInstance from ray.util.placement_group import PlacementGroup from vllm.executor.executor_base import ExecutorBase @@ -53,8 +54,11 @@ from vllm.model_executor.model_loader.loader import BaseModelLoader from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( BaseTokenizerGroup) + + Config = TypeVar("Config", bound=DataclassInstance) else: QuantizationConfig = None + Config = TypeVar("Config") logger = init_logger(__name__) @@ -159,7 +163,7 @@ def pairwise(iterable): return out -def config(cls: type[Any]) -> type[Any]: +def config(cls: type[Config]) -> type[Config]: """ A decorator that ensures all fields in a dataclass have default values and that each field has a docstring. From 2d39e64b3a56814fee8313dea258ff6c77693d8a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:26:20 +0200 Subject: [PATCH 5/8] Handle `Union[str, ...]` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9986fb83a20e..f1e8afb96409 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -236,9 +236,13 @@ def __post_init__(self): def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """Shared CLI arguments for vLLM engine.""" + def is_type_in_union(cls: type[Any], type: type[Any]) -> bool: + """Check if the class is a type in a union type.""" + return get_origin(cls) is Union and type in get_args(cls) + def is_optional(cls: type[Any]) -> bool: """Check if the class is an optional type.""" - return get_origin(cls) is Union and type(None) in get_args(cls) + return is_type_in_union(cls, type(None)) def get_kwargs(cls: type[Any]) -> Dict[str, Any]: cls_docs = get_attr_docs(cls) @@ -257,6 +261,10 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]: if is_optional(field.type): kwargs[name]["type"] = nullable_str continue + # Handle str in union fields + if is_type_in_union(field.type, str): + kwargs[name]["type"] = str + continue kwargs[name]["type"] = field.type return kwargs From 4d5df236872e66fd65ae11578a295617bb03e582 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:43:43 +0200 Subject: [PATCH 6/8] Use `Enum` member value as default Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 3770adb4166a..9f0eeef1142a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1440,7 +1440,8 @@ class LoadFormat(str, enum.Enum): class LoadConfig: """Configuration for loading the model weights.""" - load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO + load_format: Union[str, LoadFormat, + "BaseModelLoader"] = LoadFormat.AUTO.value """The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. From fe71abc0817f5860e068dc4bf5ea214d0fcbc25c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 11 Apr 2025 15:59:16 +0200 Subject: [PATCH 7/8] Better handling of newlines in config docstrings Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 22 +++++++++++----------- vllm/utils.py | 18 ++++++++++++++++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9f0eeef1142a..359039336bd8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1442,25 +1442,25 @@ class LoadConfig: load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO.value - """The format of the model weights to load: + """The format of the model weights to load:\n - "auto" will try to load the weights in the safetensors format and fall - back to the pytorch bin format if safetensors format is not available. - - "pt" will load the weights in the pytorch bin format. - - "safetensors" will load the weights in the safetensors format. + back to the pytorch bin format if safetensors format is not available.\n + - "pt" will load the weights in the pytorch bin format.\n + - "safetensors" will load the weights in the safetensors format.\n - "npcache" will load the weights in pytorch format and store a numpy cache - to speed up the loading. + to speed up the loading.\n - "dummy" will initialize the weights with random values, which is mainly - for profiling. + for profiling.\n - "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. See the Tensorize vLLM Model script in the Examples section for - more information. + more information.\n - "runai_streamer" will load the Safetensors weights using Run:ai Model - Streamer. - - "bitsandbytes" will load the weights using bitsandbytes quantization. + Streamer.\n + - "bitsandbytes" will load the weights using bitsandbytes quantization.\n - "sharded_state" will load weights from pre-sharded checkpoint files, - supporting efficient loading of tensor-parallel models. + supporting efficient loading of tensor-parallel models.\n - "gguf" will load weights from GGUF format files (details specified in - https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). + https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n - "mistral" will load weights from consolidated safetensors files used by Mistral models.""" download_dir: Optional[str] = None diff --git a/vllm/utils.py b/vllm/utils.py index 19e15fccf38a..0fa3384aa090 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -24,6 +24,7 @@ import subprocess import sys import tempfile +import textwrap import threading import time import traceback @@ -32,7 +33,7 @@ import warnings import weakref from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, - ArgumentTypeError, RawTextHelpFormatter) + ArgumentTypeError) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, @@ -1222,9 +1223,22 @@ def __call__(self, parser, namespace, values, option_string=None): "Expected 'true' or 'false'.") -class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawTextHelpFormatter): +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" + def _split_lines(self, text, width): + """ + 1. Sentences split across lines have their single newlines removed. + 2. Paragraphs and explicit newlines are split into separate lines. + 3. Each line is wrapped to the specified width (width of terminal). + """ + # The patterns also include whitespace after the newline + single_newline = re.compile("(? Date: Fri, 11 Apr 2025 19:24:34 +0200 Subject: [PATCH 8/8] Typo Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1b476528ac94..70e628ed1680 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -356,9 +356,9 @@ def get_kwargs(cls: type[Any]) -> Dict[str, Any]: **load_kwargs["download_dir"]) load_group.add_argument('--model-loader-extra-config', **load_kwargs["model_loader_extra_config"]) - parser.add_argument('--use-tqdm-on-load', - action=argparse.BooleanOptionalAction, - **load_kwargs["use_tqdm_on_load"]) + load_group.add_argument('--use-tqdm-on-load', + action=argparse.BooleanOptionalAction, + **load_kwargs["use_tqdm_on_load"]) parser.add_argument( '--config-format',