From 13f9f7a3d0373421ee9fd7498e450214e134aa6c Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Wed, 25 Sep 2024 08:08:55 +0800
Subject: [PATCH] [Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)

---
 docs/source/quantization/bnb.rst             |  2 +-
 examples/lora_with_quantization_inference.py | 26 +++++++---------
 requirements-test.txt                        |  2 +-
 tests/quantization/test_bitsandbytes.py      |  2 +-
 vllm/config.py                               | 30 ++++++++++++++-----
 .../layers/quantization/bitsandbytes.py      |  8 ++---
 vllm/model_executor/model_loader/loader.py   |  8 ++---
 7 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst
index aefb54a8acb65..682938cc63d48 100644
--- a/docs/source/quantization/bnb.rst
+++ b/docs/source/quantization/bnb.rst
@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
 
 .. code-block:: console
 
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
diff --git a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py
index 3b2347c1115e1..0c454ea50f665 100644
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
         # It quantizes the model when loading, with some config info from the
         # LoRA adapter repo. So need to set the parameter of load_format and
         # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
     else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
diff --git a/requirements-test.txt b/requirements-test.txt
index 10d463de27be5..9c6fadb88865a 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 36167cf95f589..ac2ebc622ba6f 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     gpu_memory_utilization=0.8) as llm:
         vllm_outputs = llm.generate_greedy(prompts, 8)
     vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
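The example and test hunks above drop `enforce_eager=True`: this upgrade removes the blanket eager-mode constraint on BitsAndBytes, so 4-bit models can be served with CUDA graphs enabled (the updated test exercises exactly this via `enforce_eager=False`). A minimal sketch of the resulting default path; the model name is a placeholder assumption, not taken from this patch:

```python
# Sketch: in-flight 4-bit BitsAndBytes loading after this patch.
# enforce_eager=True is no longer required; CUDA graph capture stays on.
# "huggyllama/llama-7b" is a placeholder model, not part of the patch.
from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(model="huggyllama/llama-7b",
                         quantization="bitsandbytes",
                         load_format="bitsandbytes")
engine = LLMEngine.from_engine_args(engine_args)
```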
diff --git a/vllm/config.py b/vllm/config.py
index 8c65d99c44651..562564bbfa032 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -222,6 +222,7 @@ def __init__(self,
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
+        self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@@ -337,6 +338,28 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) does not yet support
+        CUDA graphs with 8-bit models.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitsAndBytes 8-bit yet, "
+                "falling back to eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -401,13 +424,6 @@ def verify_with_parallel_config(
                 "Pipeline parallelism is only supported for the following "
                 f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 66bc5395dbd7a..38495d5a5a863 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
         self.quant_config = quant_config
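One caveat on the version gate above (the loader change below repeats the same pattern): `bitsandbytes.__version__ < "0.44.0"` compares version strings lexicographically, so a hypothetical future release such as "0.100.0" would wrongly fail the check. A more robust comparison, offered only as a sketch and not part of this patch, assuming the `packaging` library is available in the environment:

```python
# Sketch of a lexicographic-safe version gate; not part of this patch.
# Assumes `packaging` is installed in the environment.
from packaging.version import Version

import bitsandbytes

if Version(bitsandbytes.__version__) < Version("0.44.0"):
    raise ImportError("bitsandbytes version is too old. Please "
                      "install bitsandbytes>=0.44.0.")
```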
Please " - "install bitsandbytes>=0.42.0.") + "install bitsandbytes>=0.44.0.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.42.0 via " - "`pip install bitsandbytes>=0.42.0` to use " + raise ImportError("Please install bitsandbytes>=0.44.0 via " + "`pip install bitsandbytes>=0.44.0` to use " "bitsandbytes quantizer.") from err hf_weights_files, use_safetensors = self._prepare_weights(