[Bugfix] Fix broken OpenAI tensorizer test #8258

Merged · 3 commits · Sep 7, 2024
12 changes: 6 additions & 6 deletions tests/utils.py

@@ -20,7 +20,7 @@
                               init_distributed_environment)
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.model_executor.model_loader.loader import DefaultModelLoader
+from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip

@@ -89,11 +89,11 @@ def __init__(self,
         is_local = os.path.isdir(model)
         if not is_local:
             engine_args = AsyncEngineArgs.from_cli_args(args)
-            engine_config = engine_args.create_engine_config()
-            dummy_loader = DefaultModelLoader(engine_config.load_config)
-            dummy_loader._prepare_weights(engine_config.model_config.model,
-                                          engine_config.model_config.revision,
-                                          fall_back_to_pt=True)
+            model_config = engine_args.create_model_config()
+            load_config = engine_args.create_load_config()
+
+            model_loader = get_model_loader(load_config)
+            model_loader.download_model(model_config)

         env = os.environ.copy()
         # the current process might initialize cuda,
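For context, the refactored download path in the test helper reduces to the following. A minimal sketch, assuming a vLLM checkout at this commit; "facebook/opt-125m" is only a stand-in model name:

    # Build just the configs needed for a download, then delegate to the
    # loader that matches the configured load format.
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.model_executor.model_loader.loader import get_model_loader

    engine_args = AsyncEngineArgs(model="facebook/opt-125m")
    model_config = engine_args.create_model_config()
    load_config = engine_args.create_load_config()

    model_loader = get_model_loader(load_config)  # e.g. DefaultModelLoader
    model_loader.download_model(model_config)     # fetch weights only

The old pattern instantiated DefaultModelLoader directly and reached into its private _prepare_weights; routing through get_model_loader and the public download_model keeps the test from depending on loader internals.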
72 changes: 39 additions & 33 deletions vllm/engine/arg_utils.py

@@ -771,33 +771,8 @@ def from_cli_args(cls, args: argparse.Namespace):
         engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
         return engine_args

-    def create_engine_config(self) -> EngineConfig:
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")
-
-        if (self.load_format == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.quantization != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
-
-        device_config = DeviceConfig(device=self.device)
-        model_config = ModelConfig(
+    def create_model_config(self) -> ModelConfig:
+        return ModelConfig(
             model=self.model,
             tokenizer=self.tokenizer,
             tokenizer_mode=self.tokenizer_mode,

@@ -825,6 +800,42 @@ def create_engine_config(self) -> EngineConfig:
             config_format=self.config_format,
         )

+    def create_load_config(self) -> LoadConfig:
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
+
+    def create_engine_config(self) -> EngineConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # bitsandbytes quantization needs a specific model loader
+        # so we make sure the quant method and the load format are consistent
+        if (self.quantization == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.load_format != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes quantization and QLoRA adapter only support "
+                f"'bitsandbytes' load format, but got {self.load_format}")
+
+        if (self.load_format == "bitsandbytes" or
+           self.qlora_adapter_name_or_path is not None) and \
+           self.quantization != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes load format and QLoRA adapter only support "
+                f"'bitsandbytes' quantization, but got {self.quantization}")
+
+        assert self.cpu_offload_gb >= 0, (
+            "CPU offload space must be non-negative"
+            f", but got {self.cpu_offload_gb}")
+
+        device_config = DeviceConfig(device=self.device)
+        model_config = self.create_model_config()
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len

@@ -967,12 +978,7 @@ def create_engine_config(self) -> EngineConfig:
             self.model_loader_extra_config[
                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path

-        load_config = LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
+        load_config = self.create_load_config()

         prompt_adapter_config = PromptAdapterConfig(
             max_prompt_adapters=self.max_prompt_adapters,
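One subtlety visible in this diff: create_engine_config still performs the gguf and bitsandbytes consistency checks (which may mutate self.quantization and self.load_format) before delegating to the new helpers, so callers that invoke create_model_config or create_load_config directly, as the test above does, bypass those checks.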
30 changes: 29 additions & 1 deletion vllm/model_executor/model_loader/loader.py

@@ -185,6 +185,11 @@ class BaseModelLoader(ABC):
     def __init__(self, load_config: LoadConfig):
         self.load_config = load_config

+    @abstractmethod
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download a model so that it can be immediately loaded."""
+        raise NotImplementedError
+
     @abstractmethod
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,

@@ -193,7 +198,7 @@ def load_model(self, *, model_config: ModelConfig,
                    scheduler_config: SchedulerConfig,
                    cache_config: CacheConfig) -> nn.Module:
         """Load a model with the given configurations."""
-        ...
+        raise NotImplementedError


 class DefaultModelLoader(BaseModelLoader):

@@ -335,6 +340,11 @@ def _xla_weights_iterator(iterator: Generator):
             weights_iterator = _xla_weights_iterator(weights_iterator)
         return weights_iterator

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model,
+                              model_config.revision,
+                              fall_back_to_pt=True)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -377,6 +387,9 @@ def __init__(self, load_config: LoadConfig):
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

+    def download_model(self, model_config: ModelConfig) -> None:
+        pass  # Nothing to download
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -467,6 +480,12 @@ def _load_model_serialized(
         model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
         return model.eval()

+    def download_model(self, model_config: ModelConfig) -> None:
+        self.tensorizer_config.verify_with_model_config(model_config)
+
+        with self.tensorizer_config.open_stream():
+            pass
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -568,6 +587,9 @@ def _prepare_weights(self, model_name_or_path: str,
             ignore_patterns=self.load_config.ignore_patterns,
         )

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -995,6 +1017,9 @@ def _load_weights(self, model_config: ModelConfig,
                 set_weight_attrs(
                     param, {"matmul_state": [None] * len(quant_states)})

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model, model_config.revision)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],

@@ -1070,6 +1095,9 @@ def _get_weights_iterator(
         return gguf_quant_weights_iterator(model_name_or_path,
                                            gguf_to_hf_name_map)

+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model)
+
     def load_model(self, *, model_config: ModelConfig,
                    device_config: DeviceConfig,
                    lora_config: Optional[LoRAConfig],
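Since download_model is now abstract on BaseModelLoader, every loader subclass must override it (and load_model's stub now raises NotImplementedError rather than silently returning None if the base body is ever reached). A hypothetical subclass, purely for illustration — the class name and stub bodies below are not part of vLLM:

    from vllm.config import ModelConfig
    from vllm.model_executor.model_loader.loader import BaseModelLoader

    class MirrorModelLoader(BaseModelLoader):
        """Illustrative only: a loader backed by an internal mirror."""

        def download_model(self, model_config: ModelConfig) -> None:
            # A real implementation would fetch model_config.model at
            # model_config.revision into self.load_config.download_dir.
            raise NotImplementedError

        def load_model(self, *, model_config, device_config, lora_config,
                       parallel_config, scheduler_config, cache_config):
            # A real implementation would build and return an nn.Module.
            raise NotImplementedError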
7 changes: 7 additions & 0 deletions vllm/model_executor/model_loader/tensorizer.py

@@ -99,6 +99,13 @@ def verify_with_model_config(self, model_config: "ModelConfig") -> None:
                 "Loading a model using Tensorizer with quantization on vLLM"
                 " is unstable and may lead to errors.")

+    def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
+        if tensorizer_args is None:
+            tensorizer_args = self._construct_tensorizer_args()
+
+        return open_stream(self.tensorizer_uri,
+                           **tensorizer_args.stream_params)
+

 def load_with_tensorizer(tensorizer_config: TensorizerConfig,
                          **extra_kwargs) -> nn.Module:
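The new open_stream helper is what TensorizerLoader.download_model uses above to check that the serialized tensors are reachable before any loading happens. A minimal sketch; the local path is illustrative, and remote URIs flow through the same stream_params:

    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

    config = TensorizerConfig(tensorizer_uri="/tmp/model.tensors")
    # Opening and immediately closing the stream validates that the URI is
    # reachable (and, for remote URIs, that credentials work) without
    # deserializing any tensors.
    with config.open_stream():
        pass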