From 13f9f7a3d0373421ee9fd7498e450214e134aa6c Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 25 Sep 2024 08:08:55 +0800
Subject: [PATCH] [Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)

---
 docs/source/quantization/bnb.rst             |  2 +-
 examples/lora_with_quantization_inference.py | 26 +++++++---------
 requirements-test.txt                        |  2 +-
 tests/quantization/test_bitsandbytes.py      |  2 +-
 vllm/config.py                               | 30 ++++++++++++++-----
 .../layers/quantization/bitsandbytes.py      |  8 ++---
 vllm/model_executor/model_loader/loader.py   |  8 ++---
 7 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst
index aefb54a8acb65..682938cc63d48 100644
--- a/docs/source/quantization/bnb.rst
+++ b/docs/source/quantization/bnb.rst
@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
 
 .. code-block:: console
 
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
diff --git a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py
index 3b2347c1115e1..0c454ea50f665 100644
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
         # It quantizes the model when loading, with some config info from the
         # LoRA adapter repo. So need to set the parameter of load_format and
         # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
     else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
diff --git a/requirements-test.txt b/requirements-test.txt
index 10d463de27be5..9c6fadb88865a 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 36167cf95f589..ac2ebc622ba6f 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     gpu_memory_utilization=0.8) as llm:
         vllm_outputs = llm.generate_greedy(prompts, 8)
     vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
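The example and test hunks above drop `enforce_eager=True`: this upgrade removes the blanket eager-mode constraint on BitsAndBytes, so 4-bit models can be served with CUDA graphs enabled (the updated test exercises exactly this via `enforce_eager=False`). A minimal sketch of the resulting default path; the model name is a placeholder assumption, not taken from this patch:

```python
# Sketch: in-flight 4-bit BitsAndBytes loading after this patch.
# enforce_eager=True is no longer required; CUDA graph capture stays on.
# "huggyllama/llama-7b" is a placeholder model, not part of the patch.
from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(model="huggyllama/llama-7b",
                         quantization="bitsandbytes",
                         load_format="bitsandbytes")
engine = LLMEngine.from_engine_args(engine_args)
```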
diff --git a/vllm/config.py b/vllm/config.py
index 8c65d99c44651..562564bbfa032 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -222,6 +222,7 @@ def __init__(self,
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
+        self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@@ -337,6 +338,28 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) does not yet support
+        CUDA graphs with 8-bit models.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitsAndBytes 8-bit yet, "
+                "falling back to eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -401,13 +424,6 @@ def verify_with_parallel_config(
                 "Pipeline parallelism is only supported for the following "
                 f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 66bc5395dbd7a..38495d5a5a863 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
         self.quant_config = quant_config
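One caveat on the version gate above (the loader change below repeats the same pattern): `bitsandbytes.__version__ < "0.44.0"` compares version strings lexicographically, so a hypothetical future release such as "0.100.0" would wrongly fail the check. A more robust comparison, offered only as a sketch and not part of this patch, assuming the `packaging` library is available in the environment:

```python
# Sketch of a lexicographic-safe version gate; not part of this patch.
# Assumes `packaging` is installed in the environment.
from packaging.version import Version

import bitsandbytes

if Version(bitsandbytes.__version__) < Version("0.44.0"):
    raise ImportError("bitsandbytes version is too old. Please "
                      "install bitsandbytes>=0.44.0.")
```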
Please " - "install bitsandbytes>=0.42.0.") + "install bitsandbytes>=0.44.0.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.42.0 via " - "`pip install bitsandbytes>=0.42.0` to use " + raise ImportError("Please install bitsandbytes>=0.44.0 via " + "`pip install bitsandbytes>=0.44.0` to use " "bitsandbytes quantizer.") from err hf_weights_files, use_safetensors = self._prepare_weights(