diff --git a/tests/utils.py b/tests/utils.py
index 04067ef372ac2..6e5bc05b3901a 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -20,7 +20,7 @@
                               init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.model_executor.model_loader.loader import DefaultModelLoader
+from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
@@ -89,11 +89,11 @@ def __init__(self,
         is_local = os.path.isdir(model)
         if not is_local:
             engine_args = AsyncEngineArgs.from_cli_args(args)
-            engine_config = engine_args.create_engine_config()
-            dummy_loader = DefaultModelLoader(engine_config.load_config)
-            dummy_loader._prepare_weights(engine_config.model_config.model,
-                                          engine_config.model_config.revision,
-                                          fall_back_to_pt=True)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)
 
         env = os.environ.copy()
         # the current process might initialize cuda,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7620093660b43..9bc03948d3845 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -771,33 +771,8 @@ def from_cli_args(cls, args: argparse.Namespace):
         engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
         return engine_args
 
-    def create_engine_config(self) -> EngineConfig:
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")
-
-        if (self.load_format == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.quantization != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
-
-        device_config = DeviceConfig(device=self.device)
-        model_config = ModelConfig(
+    def create_model_config(self) -> ModelConfig:
+        return ModelConfig(
             model=self.model,
             tokenizer=self.tokenizer,
             tokenizer_mode=self.tokenizer_mode,
@@ -825,6 +800,42 @@ def create_engine_config(self) -> EngineConfig:
             config_format=self.config_format,
         )
 
+    def create_load_config(self) -> LoadConfig:
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
+
+    def create_engine_config(self) -> EngineConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # bitsandbytes quantization needs a specific model loader
+        # so we make sure the quant method and the load format are consistent
+        if (self.quantization == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.load_format != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes quantization and QLoRA adapter only support "
+                f"'bitsandbytes' load format, but got {self.load_format}")
+
+        if (self.load_format == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.quantization != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes load format and QLoRA adapter only support "
+                f"'bitsandbytes' quantization, but got {self.quantization}")
+
+        assert self.cpu_offload_gb >= 0, (
+            "CPU offload space must be non-negative"
+            f", but got {self.cpu_offload_gb}")
+
+        device_config = DeviceConfig(device=self.device)
+        model_config = self.create_model_config()
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len
@@ -967,12 +978,7 @@ def create_engine_config(self) -> EngineConfig:
             self.model_loader_extra_config[
                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
 
-        load_config = LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
+        load_config = self.create_load_config()
 
         prompt_adapter_config = PromptAdapterConfig(
             max_prompt_adapters=self.max_prompt_adapters,
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index bcc866a194637..f59eb805ea907 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -185,6 +185,11 @@ class BaseModelLoader(ABC):
     def __init__(self, load_config: LoadConfig):
         self.load_config = load_config
 
+    @abstractmethod
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download a model so that it can be immediately loaded."""
+        raise NotImplementedError
+
     @abstractmethod
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
@@ -193,7 +198,7 @@ def load_model(self, *, model_config: ModelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         """Load a model with the given configurations."""
-        ...
+        raise NotImplementedError
 
 
 class DefaultModelLoader(BaseModelLoader):
@@ -335,6 +340,11 @@ def _xla_weights_iterator(iterator: Generator):
             weights_iterator = _xla_weights_iterator(weights_iterator)
         return weights_iterator
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model,
+                              model_config.revision,
+                              fall_back_to_pt=True)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
@@ -377,6 +387,9 @@ def __init__(self, load_config: LoadConfig):
             raise ValueError(f"Model loader extra config is not supported for "
                              f"load format {load_config.load_format}")
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        pass  # Nothing to download
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
@@ -467,6 +480,12 @@ def _load_model_serialized(
             model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
             return model.eval()
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        self.tensorizer_config.verify_with_model_config(model_config)
+
+        with self.tensorizer_config.open_stream():
+            pass
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
@@ -568,6 +587,9 @@ def _prepare_weights(self, model_name_or_path: str,
             ignore_patterns=self.load_config.ignore_patterns,
         )
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
@@ -995,6 +1017,9 @@ def _load_weights(self, model_config: ModelConfig,
                     set_weight_attrs(
                         param, {"matmul_state": [None] * len(quant_states)})
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
@@ -1070,6 +1095,9 @@ def _get_weights_iterator(
         return gguf_quant_weights_iterator(model_name_or_path,
                                            gguf_to_hf_name_map)
 
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index b009ad8c882d4..3aac5cd2b43a5 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -99,6 +99,13 @@ def verify_with_model_config(self, model_config: "ModelConfig") -> None:
                 "Loading a model using Tensorizer with quantization on vLLM"
                 " is unstable and may lead to errors.")
 
+    def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
+        if tensorizer_args is None:
+            tensorizer_args = self._construct_tensorizer_args()
+
+        return open_stream(self.tensorizer_uri,
+                           **tensorizer_args.stream_params)
+
 
 def load_with_tensorizer(tensorizer_config: TensorizerConfig,
                          **extra_kwargs) -> nn.Module:
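
For reviewers, a minimal sketch of how the refactored surface is meant to be used from calling code, mirroring the tests/utils.py change above. The model name is an arbitrary placeholder, not part of this change:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.model_executor.model_loader.loader import get_model_loader

    # Any EngineArgs instance works; this model name is just an example.
    engine_args = EngineArgs(model="facebook/opt-125m")

    # Build only the two lightweight configs needed for downloading,
    # instead of the full EngineConfig from create_engine_config().
    model_config = engine_args.create_model_config()
    load_config = engine_args.create_load_config()

    # Every BaseModelLoader subclass now implements download_model(), so
    # callers no longer reach into DefaultModelLoader._prepare_weights(),
    # and the correct loader is picked for any load format.
    model_loader = get_model_loader(load_config)
    model_loader.download_model(model_config)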