From 846f853517945f411ee5c12427e9ca67a855022a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 29 May 2024 08:20:42 -0400 Subject: [PATCH 01/61] enable cpu bnb path --- src/transformers/quantizers/quantizer_bnb_8bit.py | 7 ++----- src/transformers/utils/import_utils.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index dbfceac2de8667..9479d562785762 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -64,10 +64,7 @@ def __init__(self, quantization_config, **kwargs): self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules def validate_environment(self, *args, **kwargs): - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed for quantization.") - - if not is_accelerate_available(): + if not (is_accelerate_available() and is_bitsandbytes_available()): raise ImportError( f"Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) @@ -126,7 +123,7 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": return torch_dtype def update_device_map(self, device_map): - if device_map is None: + if device_map is None and torch.cuda.is_available(): device_map = {"": torch.cuda.current_device()} logger.info( "The device_map was not initialized. " diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index c4bb1a64eb6361..fb315e977cf434 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -848,7 +848,7 @@ def is_bitsandbytes_available(): # let's avoid that by adding a simple check import torch - return _bitsandbytes_available and torch.cuda.is_available() + return _bitsandbytes_available def is_flash_attn_2_available(): From 6c567037d4de6b8bcdb233aa69ab0ce73e7bbd65 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 29 May 2024 08:26:39 -0400 Subject: [PATCH 02/61] fix style --- src/transformers/utils/import_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index fb315e977cf434..2fe1f2c700facc 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -846,7 +846,6 @@ def is_bitsandbytes_available(): # bitsandbytes throws an error if cuda is not available # let's avoid that by adding a simple check - import torch return _bitsandbytes_available From 3f02c9bed7ea3bb30dc6bf8838fff1054107de16 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 29 May 2024 08:36:09 -0400 Subject: [PATCH 03/61] fix code style --- src/transformers/quantizers/quantizer_bnb_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 936a1820a8da4d..1bd70139ae5101 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -248,7 +248,7 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_device_map def update_device_map(self, device_map): - if device_map is None: + if device_map is None and torch.cuda.is_available(): device_map = {"": torch.cuda.current_device()} logger.info( "The device_map was not 
initialized. " From 9ccbf109ab880ca826c2e7431e505581f91bcc37 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 29 May 2024 08:54:28 -0400 Subject: [PATCH 04/61] fix 4 bit path --- src/transformers/quantizers/quantizer_bnb_4bit.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 1bd70139ae5101..232f0fd9164f7a 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -64,9 +64,7 @@ def __init__(self, quantization_config, **kwargs): self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules def validate_environment(self, *args, **kwargs): - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed for quantization.") - if not is_accelerate_available(): + if not (is_accelerate_available() and is_bitsandbytes_available()) raise ImportError( f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) From 89fa5ef49dc6fa1dc84b2d5fb89a858dbad1040e Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 17 Jul 2024 09:01:29 +0800 Subject: [PATCH 05/61] Update src/transformers/utils/import_utils.py Co-authored-by: Aarni Koskela --- src/transformers/utils/import_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 2fe1f2c700facc..c3fe3e0a9c73bc 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -844,8 +844,6 @@ def is_bitsandbytes_available(): if not is_torch_available(): return False - # bitsandbytes throws an error if cuda is not available - # let's avoid that by adding a simple check return _bitsandbytes_available From a52d7af1afc444138d9c4357d7688dbfa9ac6b0f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 16 Jul 2024 21:48:59 -0400 Subject: [PATCH 06/61] add multi backend refactor tests --- src/transformers/quantizers/quantizer_bnb_4bit.py | 4 ++++ src/transformers/quantizers/quantizer_bnb_8bit.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 232f0fd9164f7a..2125f6c3d26b4e 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -72,6 +72,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) + if not torch.cuda.is_available(): + import bitsandbytes as bnb + if not getattr(bnb, "is_multi_backend_refactor_preview", False): + raise RuntimeError("Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends.") if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 9479d562785762..7292916295afe0 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -72,6 +72,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U 
bitsandbytes`" ) + if not torch.cuda.is_available(): + import bitsandbytes as bnb + if not getattr(bnb, "is_multi_backend_refactor_preview", False): + raise RuntimeError("Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends.") if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( From 6f6786252f15f025a3d113b87d2518f3d9c44ed2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 16 Jul 2024 21:56:28 -0400 Subject: [PATCH 07/61] fix style --- src/transformers/quantizers/quantizer_bnb_4bit.py | 5 ++++- src/transformers/quantizers/quantizer_bnb_8bit.py | 5 ++++- src/transformers/utils/import_utils.py | 1 - 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 2125f6c3d26b4e..2ef9a5101ce125 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -74,8 +74,11 @@ def validate_environment(self, *args, **kwargs): ) if not torch.cuda.is_available(): import bitsandbytes as bnb + if not getattr(bnb, "is_multi_backend_refactor_preview", False): - raise RuntimeError("Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends.") + raise RuntimeError( + "Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends." + ) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 7292916295afe0..4deba5ccaee012 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -74,8 +74,11 @@ def validate_environment(self, *args, **kwargs): ) if not torch.cuda.is_available(): import bitsandbytes as bnb + if not getattr(bnb, "is_multi_backend_refactor_preview", False): - raise RuntimeError("Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends.") + raise RuntimeError( + "Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends." 
+ ) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index c3fe3e0a9c73bc..9c086b81662b76 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -844,7 +844,6 @@ def is_bitsandbytes_available(): if not is_torch_available(): return False - return _bitsandbytes_available From ee23eb0a7cd68b2d8bdf6f5f5aeb526f0d06cd37 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 30 Jul 2024 21:26:33 +0000 Subject: [PATCH 08/61] tweak 4bit quantizer + fix corresponding tests --- .../quantizers/quantizer_bnb_4bit.py | 12 +++- src/transformers/testing_utils.py | 35 ++++++++++++ tests/quantization/bnb/test_4bit.py | 57 +++++++++++++------ tests/quantization/bnb/test_mixed_int8.py | 9 ++- 4 files changed, 92 insertions(+), 21 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 2ef9a5101ce125..fd83c5e8647658 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -72,12 +72,16 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) + import bitsandbytes as bnb + + bnb_is_multibackend_enabled = "multi_backend" in getattr(bnb, "features", set()) + if not torch.cuda.is_available(): import bitsandbytes as bnb - if not getattr(bnb, "is_multi_backend_refactor_preview", False): + if not bnb_is_multibackend_enabled: raise RuntimeError( - "Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends." + "Current bitsandbytes (`main`) only supports CUDA, please switch to the `multi-backend-refactor` preview release for WIP support of other backends." ) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): @@ -95,7 +99,9 @@ def validate_environment(self, *args, **kwargs): device_map_without_lm_head = { key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert } - if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + if set(device_map.values()) == {"cpu"} and bnb_is_multibackend_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): raise ValueError( "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3d30c9ff647980..e1d650eeeb6cbf 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -328,6 +328,29 @@ def tooslow(test_case): return unittest.skip(reason="test is too slow")(test_case) +def skip_if_not_implemented(test_func): + @functools.wraps(test_func) + def wrapper(*args, **kwargs): + try: + return test_func(*args, **kwargs) + except NotImplementedError as e: + raise unittest.SkipTest(f"Test skipped due to NotImplementedError: {e}") + + return wrapper + + +def apply_skip_if_not_implemented(cls): + """ + Class decorator to apply @skip_if_not_implemented to all test methods. 
+ """ + for attr_name in dir(cls): + if attr_name.startswith("test_"): + attr = getattr(cls, attr_name) + if callable(attr): + setattr(cls, attr_name, skip_if_not_implemented(attr)) + return cls + + def custom_tokenizers(test_case): """ Decorator marking a test for a custom tokenizer. @@ -927,6 +950,18 @@ def require_torch_gpu(test_case): return unittest.skipUnless(torch_device == "cuda", "test requires CUDA")(test_case) +def require_torch_gpu_if_bnb_not_multi_backend_enabled(test_case): + """ + Decorator marking a test that requires a GPU if bitsandbytes multi-backend feature is not enabled. + """ + if is_bitsandbytes_available(): + import bitsandbytes as bnb + + if hasattr(bnb, "features") and "multi_backend" in bnb.features: + return test_case + return require_torch_gpu(test_case) + + def require_torch_accelerator(test_case): """Decorator marking a test that requires an accessible accelerator and PyTorch.""" return unittest.skipUnless(torch_device is not None and torch_device != "cpu", "test requires accelerator")( diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 71a2d7c81572e7..cc11d96c6c1e62 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -30,12 +30,13 @@ pipeline, ) from transformers.testing_utils import ( + apply_skip_if_not_implemented, is_bitsandbytes_available, is_torch_available, require_accelerate, require_bitsandbytes, require_torch, - require_torch_gpu, + require_torch_gpu_if_bnb_not_multi_backend_enabled, require_torch_multi_gpu, slow, torch_device, @@ -81,11 +82,18 @@ def forward(self, input, *args, **kwargs): if is_bitsandbytes_available(): import bitsandbytes as bnb + def setUpModule(): + global device + if hasattr(bnb, "features") and "multi_backend" in bnb.features: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + device = torch.device("cuda:0") + @require_bitsandbytes @require_accelerate @require_torch -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow class Base4bitTest(unittest.TestCase): # We keep the constants inside the init function and model loading inside setUp function @@ -111,6 +119,7 @@ def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) +@apply_skip_if_not_implemented class Bnb4BitTest(Base4bitTest): def setUp(self): super().setUp() @@ -206,7 +215,7 @@ def test_rwkv_4bit(self): tok = AutoTokenizer.from_pretrained(model_id) text = "Hello my name is" - input_ids = tok.encode(text, return_tensors="pt").to(0) + input_ids = tok.encode(text, return_tensors="pt").to(device) _ = model.generate(input_ids, max_new_tokens=30) @@ -217,7 +226,7 @@ def test_generate_quality(self): the same output across GPUs. So we'll generate few tokens (5-10) and check their output. 
""" encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = self.model_4bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = self.model_4bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -234,7 +243,7 @@ def test_generate_quality_config(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_4bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -252,7 +261,7 @@ def test_generate_quality_dequantize(self): model_4bit.dequantize() encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_4bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model_4bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -285,7 +294,7 @@ def test_device_and_dtype_assignment(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -314,8 +323,9 @@ def test_bnb_4bit_wrong_config(self): @require_bitsandbytes @require_accelerate @require_torch -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow +@apply_skip_if_not_implemented class Bnb4BitT5Test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -345,14 +355,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -370,17 +380,18 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, 
return_tensors="pt").to(device) _ = model.generate(**encoded_input) +@apply_skip_if_not_implemented class Classes4BitModelTest(Base4bitTest): def setUp(self): super().setUp() @@ -430,6 +441,7 @@ def test_correct_head_class(self): self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter) +@apply_skip_if_not_implemented class Pipeline4BitTest(Base4bitTest): def setUp(self): super().setUp() @@ -454,7 +466,12 @@ def test_pipeline(self): self.pipe = pipeline( "text-generation", model=self.model_name, - model_kwargs={"device_map": "auto", "load_in_4bit": True, "torch_dtype": torch.float16}, + model_kwargs={ + "device_map": "auto", + "load_in_4bit": True, + # float16 isn't supported on CPU, use bfloat16 instead + "torch_dtype": torch.bloat16 if device == "cpu" else torch.float16, + }, max_new_tokens=self.MAX_NEW_TOKENS, ) @@ -464,6 +481,7 @@ def test_pipeline(self): @require_torch_multi_gpu +@apply_skip_if_not_implemented class Bnb4bitTestMultiGpu(Base4bitTest): def setUp(self): super().setUp() @@ -485,10 +503,11 @@ def test_multi_gpu_loading(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Second real batch - output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) +@apply_skip_if_not_implemented class Bnb4BitTestTraining(Base4bitTest): def setUp(self): self.model_name = "facebook/opt-350m" @@ -517,7 +536,7 @@ def test_training(self): module.v_proj = LoRALayer(module.v_proj, rank=16) # Step 3: dummy batch - batch = self.tokenizer("Test batch ", return_tensors="pt").to(0) + batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) # Step 4: Check if the gradient is not None with torch.cuda.amp.autocast(): @@ -532,6 +551,7 @@ def test_training(self): self.assertTrue(module.weight.grad is None) +@apply_skip_if_not_implemented class Bnb4BitGPT2Test(Bnb4BitTest): model_name = "openai-community/gpt2-xl" EXPECTED_RELATIVE_DIFFERENCE = 3.3191854854152187 @@ -540,8 +560,9 @@ class Bnb4BitGPT2Test(Bnb4BitTest): @require_bitsandbytes @require_accelerate @require_torch -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow +@apply_skip_if_not_implemented class BaseSerializationTest(unittest.TestCase): model_name = "facebook/opt-125m" input_text = "Mars colonists' favorite meals are" @@ -629,6 +650,7 @@ def _decode(token): ) +@apply_skip_if_not_implemented class ExtendedSerializationTest(BaseSerializationTest): """ tests more combinations of parameters @@ -676,8 +698,9 @@ class GPTSerializationTest(BaseSerializationTest): @require_bitsandbytes @require_accelerate -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow +@apply_skip_if_not_implemented class Bnb4BitTestBasicConfigTest(unittest.TestCase): def test_load_in_4_and_8_bit_fails(self): with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"): diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index ca3f043c749a31..70028533653adf 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -31,11 +31,12 @@ ) from transformers.testing_utils import ( is_accelerate_available, + apply_skip_if_not_implemented, is_torch_available, require_accelerate, 
require_bitsandbytes, require_torch, - require_torch_gpu, + require_torch_gpu_if_bnb_not_multi_backend_enabled, require_torch_multi_gpu, slow, ) @@ -108,6 +109,7 @@ def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) +@apply_skip_if_not_implemented class MixedInt8Test(BaseMixedInt8Test): def setUp(self): super().setUp() @@ -614,6 +616,7 @@ def test_correct_head_class(self): self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter) +@apply_skip_if_not_implemented class MixedInt8TestPipeline(BaseMixedInt8Test): def setUp(self): super().setUp() @@ -648,6 +651,7 @@ def test_pipeline(self): @require_torch_multi_gpu +@apply_skip_if_not_implemented class MixedInt8TestMultiGpu(BaseMixedInt8Test): def setUp(self): super().setUp() @@ -674,6 +678,7 @@ def test_multi_gpu_loading(self): @require_torch_multi_gpu +@apply_skip_if_not_implemented class MixedInt8TestCpuGpu(BaseMixedInt8Test): def setUp(self): super().setUp() @@ -819,6 +824,7 @@ def test_cpu_gpu_disk_loading_custom_device_map_kwargs(self): self.check_inference_correctness(model_8bit) +@apply_skip_if_not_implemented class MixedInt8TestTraining(BaseMixedInt8Test): def setUp(self): self.model_name = "facebook/opt-350m" @@ -862,6 +868,7 @@ def test_training(self): self.assertTrue(module.weight.grad is None) +@apply_skip_if_not_implemented class MixedInt8GPT2Test(MixedInt8Test): model_name = "openai-community/gpt2-xl" EXPECTED_RELATIVE_DIFFERENCE = 1.8720077507258357 From 678e6732bc9879d2ac445e295e7d6afd1449edac Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 30 Jul 2024 21:43:57 +0000 Subject: [PATCH 09/61] tweak 8bit quantizer + *try* fixing corresponding tests --- .../quantizers/quantizer_bnb_8bit.py | 12 +++- tests/quantization/bnb/test_mixed_int8.py | 60 ++++++++++++------- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 4deba5ccaee012..cef19d6a03aa20 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -72,12 +72,16 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) + import bitsandbytes as bnb + + bnb_is_multibackend_enabled = "multi_backend" in getattr(bnb, "features", set()) + if not torch.cuda.is_available(): import bitsandbytes as bnb - if not getattr(bnb, "is_multi_backend_refactor_preview", False): + if not bnb_is_multibackend_enabled: raise RuntimeError( - "Current bitsandbytes only support cuda, please switch to multi_backend_refactor to support multi backends." + "Current bitsandbytes (`main`) only supports CUDA, please switch to the `multi-backend-refactor` preview release for WIP support of other backends." 
) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): @@ -95,7 +99,9 @@ def validate_environment(self, *args, **kwargs): device_map_without_lm_head = { key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert } - if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): + if set(device_map.values()) == {"cpu"} and bnb_is_multibackend_enabled: + pass + elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): raise ValueError( "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the " "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules " diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 70028533653adf..54fbdcbded044f 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -30,8 +30,9 @@ pipeline, ) from transformers.testing_utils import ( - is_accelerate_available, apply_skip_if_not_implemented, + is_accelerate_available, + is_bitsandbytes_available, is_torch_available, require_accelerate, require_bitsandbytes, @@ -78,10 +79,21 @@ def forward(self, input, *args, **kwargs): return self.module(input, *args, **kwargs) + self.adapter(input) +if is_bitsandbytes_available(): + import bitsandbytes as bnb + + def setUpModule(): + global device + if hasattr(bnb, "features") and "multi_backend" in bnb.features: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + else: + device = torch.device("cuda:0") + + @require_bitsandbytes @require_accelerate @require_torch -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow class BaseMixedInt8Test(unittest.TestCase): # We keep the constants inside the init function and model loading inside setUp function @@ -265,7 +277,7 @@ def test_generate_quality(self): the same output across GPUs. So we'll generate few tokens (5-10) and check their output. 
""" encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -282,7 +294,7 @@ def test_generate_quality_config(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_8bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -300,7 +312,7 @@ def test_generate_quality_dequantize(self): model_8bit.dequantize() encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model_8bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -348,7 +360,7 @@ def test_device_and_dtype_assignment(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -387,7 +399,9 @@ def test_int8_serialization(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model_from_saved.generate( + input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -412,7 +426,9 @@ def test_int8_serialization_regression(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model_from_saved.generate( + input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -437,7 +453,9 @@ def test_int8_serialization_sharded(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model_from_saved.generate( + input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -457,7 +475,7 @@ def test_int8_from_pretrained(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) 
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -465,7 +483,7 @@ def test_int8_from_pretrained(self): @require_bitsandbytes @require_accelerate @require_torch -@require_torch_gpu +@require_torch_gpu_if_bnb_not_multi_backend_enabled @slow class MixedInt8T5Test(unittest.TestCase): @classmethod @@ -496,14 +514,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -523,14 +541,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) def test_inference_with_keep_in_fp32_serialized(self): @@ -555,14 +573,14 @@ def test_inference_with_keep_in_fp32_serialized(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) _ = model.generate(**encoded_input) @@ -673,7 +691,7 @@ def test_multi_gpu_loading(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Second real batch - output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -688,7 +706,7 @@ def check_inference_correctness(self, model): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Check the exactness of the results - output_parallel = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_parallel = 
model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) # Get the generation output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True) @@ -853,7 +871,7 @@ def test_training(self): module.v_proj = LoRALayer(module.v_proj, rank=16) # Step 3: dummy batch - batch = self.tokenizer("Test batch ", return_tensors="pt").to(0) + batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) # Step 4: Check if the gradient is not None with torch.cuda.amp.autocast(): @@ -894,6 +912,6 @@ def test_int8_from_pretrained(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10) + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) From 0858b3e7c9033a9f05fc5c6be30710f9a6d38b29 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 1 Aug 2024 06:53:49 -0400 Subject: [PATCH 10/61] fix dequant bnb 8bit --- src/transformers/integrations/bitsandbytes.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index c49d353ccb520b..345796e310caaf 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -332,7 +332,7 @@ def get_keys_to_not_convert(model): # Copied from PEFT: https://github.com/huggingface/peft/blob/47b3712898539569c02ec5b3ed4a6c36811331a1/src/peft/utils/integrations.py#L41 -def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None): +def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None): """ Helper function to dequantize 4bit or 8bit bnb weights. 
@@ -350,7 +350,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None): logger.warning_once( f"The model is going to be dequantized in {output_tensor.dtype} - if you want to upcast it to another dtype, make sure to pass the desired dtype when quantizing the model through `bnb_4bit_quant_type` argument of `BitsAndBytesConfig`" ) - return output_tensor + return output_tensor.to(dtype) if state.SCB is None: state.SCB = weight.SCB @@ -361,7 +361,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None): if state.CxB is None: state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB) out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB) - return bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() + return bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t().to(dtype) def _create_accelerate_new_hook(old_hook): @@ -383,6 +383,7 @@ def _create_accelerate_new_hook(old_hook): def _dequantize_and_replace( model, + dtype, modules_to_not_convert=None, current_key_name=None, quantization_config=None, @@ -422,7 +423,7 @@ def _dequantize_and_replace( else: state = None - new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, state)) + new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, dtype, state)) if bias is not None: new_module.bias = bias @@ -440,6 +441,7 @@ def _dequantize_and_replace( if len(list(module.children())) > 0: _, has_been_replaced = _dequantize_and_replace( module, + dtype, modules_to_not_convert, current_key_name, quantization_config, @@ -457,6 +459,7 @@ def dequantize_and_replace( ): model, has_been_replaced = _dequantize_and_replace( model, + model.dtype, modules_to_not_convert=modules_to_not_convert, quantization_config=quantization_config, ) From c76d2430699668a667db94efb06e736e9b569296 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:16:39 +0000 Subject: [PATCH 11/61] account for Intel CPU in variability of expected outputs --- tests/quantization/bnb/test_mixed_int8.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 54fbdcbded044f..f60924f37120b2 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -895,6 +895,8 @@ class MixedInt8GPT2Test(MixedInt8Test): EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a fan of the") # Expected values on a A10 EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I am a member of the") + # Expected values on Intel CPU + EXPECTED_OUTPUTS.add("Hello my name is John Doe. I am a man. 
I am") def test_int8_from_pretrained(self): r""" From 5843f281b45d23028f2b908aa4aaf5092a622740 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 7 Aug 2024 15:49:00 -0400 Subject: [PATCH 12/61] enable cpu and xpu device map --- src/transformers/quantizers/quantizer_bnb_4bit.py | 12 +++++++++--- src/transformers/quantizers/quantizer_bnb_8bit.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index fd83c5e8647658..a5d632c538be55 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -28,6 +28,7 @@ is_accelerate_available, is_bitsandbytes_available, is_torch_available, + is_torch_xpu_available, logging, ) @@ -259,11 +260,16 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_device_map def update_device_map(self, device_map): - if device_map is None and torch.cuda.is_available(): - device_map = {"": torch.cuda.current_device()} + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} logger.info( "The device_map was not initialized. " - "Setting device_map to {'':torch.cuda.current_device()}. " + f"Setting device_map to {device_map}. " "If you want to use the model for inference, please set device_map ='auto' " ) return device_map diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index cef19d6a03aa20..e9e1a8dac0ec27 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -27,6 +27,7 @@ is_accelerate_available, is_bitsandbytes_available, is_torch_available, + is_torch_xpu_available, logging, ) from .quantizers_utils import get_module_from_name @@ -136,11 +137,16 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": return torch_dtype def update_device_map(self, device_map): - if device_map is None and torch.cuda.is_available(): - device_map = {"": torch.cuda.current_device()} + if device_map is None: + if torch.cuda.is_available(): + device_map = {"": torch.cuda.current_device()} + elif is_torch_xpu_available(): + device_map = {"": f"xpu:{torch.xpu.current_device()}"} + else: + device_map = {"": "cpu"} logger.info( "The device_map was not initialized. " - "Setting device_map to {'':torch.cuda.current_device()}. " + f"Setting device_map to {device_map}. 
" "If you want to use the model for inference, please set device_map ='auto' " ) return device_map From 1a864a86363a737bdbe4ea7732a5fa3f5b6e4629 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:37:50 +0000 Subject: [PATCH 13/61] further tweaks to account for Intel CPU --- tests/quantization/bnb/test_4bit.py | 5 ++++- tests/quantization/bnb/test_mixed_int8.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index cc11d96c6c1e62..081117ccbe9bb9 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -520,7 +520,10 @@ def test_training(self): # Step 1: freeze all parameters model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True) - self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()}) + if torch.cuda.is_available(): + self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()}) + else: + self.assertTrue(all(param.device.type == "cpu" for param in model.parameters())) for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index f60924f37120b2..7352b730d253aa 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -855,7 +855,10 @@ def test_training(self): # Step 1: freeze all parameters model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True) - self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()}) + if torch.cuda.is_available(): + self.assertEqual(set(model.hf_device_map.values()), {torch.cuda.current_device()}) + else: + self.assertTrue(all(param.device.type == "cpu" for param in model.parameters())) for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later @@ -897,6 +900,7 @@ class MixedInt8GPT2Test(MixedInt8Test): EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I am a member of the") # Expected values on Intel CPU EXPECTED_OUTPUTS.add("Hello my name is John Doe. I am a man. I am") + EXPECTED_OUTPUTS.add("Hello my name is John, and I'm a writer. I'm") def test_int8_from_pretrained(self): r""" From f3753fc800a4d8750eb4cd717e4d412f35e2219c Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:30:17 +0000 Subject: [PATCH 14/61] fix autocast to work with both cpu + cuda --- tests/quantization/bnb/test_4bit.py | 6 +++--- tests/quantization/bnb/test_mixed_int8.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 081117ccbe9bb9..3d5c9c0c3ea92c 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -85,9 +85,9 @@ def forward(self, input, *args, **kwargs): def setUpModule(): global device if hasattr(bnb, "features") and "multi_backend" in bnb.features: - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + device = "cuda" if torch.cuda.is_available() else "cpu" # TODO: how to add "xpu" ? 
else: - device = torch.device("cuda:0") + device = "cuda" @require_bitsandbytes @@ -542,7 +542,7 @@ def test_training(self): batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) # Step 4: Check if the gradient is not None - with torch.cuda.amp.autocast(): + with torch.autocast(device): out = model.forward(**batch) out.logits.norm().backward() diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 7352b730d253aa..503dab3a75a8e6 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -85,9 +85,9 @@ def forward(self, input, *args, **kwargs): def setUpModule(): global device if hasattr(bnb, "features") and "multi_backend" in bnb.features: - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + device = "cuda" if torch.cuda.is_available() else "cpu" # TODO: how to add "xpu" ? else: - device = torch.device("cuda:0") + device = "cuda" @require_bitsandbytes @@ -877,7 +877,7 @@ def test_training(self): batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) # Step 4: Check if the gradient is not None - with torch.cuda.amp.autocast(): + with torch.autocast(device): out = model.forward(**batch) out.logits.norm().backward() From 0cc1b7ea20ebb3240a75f6f97a206b83dee2a0b5 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 14 Aug 2024 20:15:10 +0000 Subject: [PATCH 15/61] fix comments --- tests/quantization/bnb/test_4bit.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 3d5c9c0c3ea92c..24c2c7b3a72a82 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -267,8 +267,10 @@ def test_generate_quality_dequantize(self): def test_device_and_dtype_assignment(self): r""" - Test whether trying to cast (or assigning a device to) a model after converting it in 8-bit will throw an error. - Checks also if other models are casted correctly. + Test whether attempting to change the device or cast the dtype of a model + after converting it to 4-bit precision will raise an appropriate error. + The test ensures that such operations are prohibited on 4-bit models + to prevent invalid conversions. 
""" with self.assertRaises(ValueError): # Tries with `str` @@ -283,11 +285,11 @@ def test_device_and_dtype_assignment(self): self.model_4bit.to(torch.device("cuda:0")) with self.assertRaises(ValueError): - # Tries with a `device` + # Tries cast the 4-bit model to float32 using `float()` self.model_4bit.float() with self.assertRaises(ValueError): - # Tries with a `device` + # Tries to cast the 4-bit model to float16 using `half()` self.model_4bit.half() # Test if we did not break anything From b6118126c5b3d27a6cbade4caf0145b16aaa8579 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 14 Aug 2024 20:30:48 +0000 Subject: [PATCH 16/61] fix comments --- tests/quantization/bnb/test_4bit.py | 6 +++--- tests/quantization/bnb/test_mixed_int8.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 24c2c7b3a72a82..8b3356cfe28af6 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -267,9 +267,9 @@ def test_generate_quality_dequantize(self): def test_device_and_dtype_assignment(self): r""" - Test whether attempting to change the device or cast the dtype of a model + Test whether attempting to change the device or cast the dtype of a model after converting it to 4-bit precision will raise an appropriate error. - The test ensures that such operations are prohibited on 4-bit models + The test ensures that such operations are prohibited on 4-bit models to prevent invalid conversions. """ with self.assertRaises(ValueError): @@ -285,7 +285,7 @@ def test_device_and_dtype_assignment(self): self.model_4bit.to(torch.device("cuda:0")) with self.assertRaises(ValueError): - # Tries cast the 4-bit model to float32 using `float()` + # Tries to cast the 4-bit model to float32 using `float()` self.model_4bit.float() with self.assertRaises(ValueError): diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 503dab3a75a8e6..24432ceb9b9664 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -333,8 +333,10 @@ def test_raise_if_config_and_load_in_8bit(self): def test_device_and_dtype_assignment(self): r""" - Test whether trying to cast (or assigning a device to) a model after converting it in 8-bit will throw an error. - Checks also if other models are casted correctly. + Test whether attempting to change the device or cast the dtype of a model + after converting it to 8-bit precision will raise an appropriate error. + The test ensures that such operations are prohibited on 8-bit models + to prevent invalid conversions. 
""" with self.assertRaises(ValueError): # Tries with `str` @@ -349,11 +351,11 @@ def test_device_and_dtype_assignment(self): self.model_8bit.to(torch.device("cuda:0")) with self.assertRaises(ValueError): - # Tries with a `device` + # Tries to cast the 8-bit model to float32 using `float()` self.model_8bit.float() with self.assertRaises(ValueError): - # Tries with a `device` + # Tries to cast the 4-bit model to float16 using `half()` self.model_8bit.half() # Test if we did not break anything From ab4836e45b9cbbf7a0ba170ab818d1e4fc56cd99 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 14 Aug 2024 22:01:55 +0000 Subject: [PATCH 17/61] switch to testing_utils.torch_device --- tests/quantization/bnb/test_4bit.py | 41 ++++++++-------- tests/quantization/bnb/test_mixed_int8.py | 57 +++++++++++------------ 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 8b3356cfe28af6..a56906a8837590 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -82,13 +82,6 @@ def forward(self, input, *args, **kwargs): if is_bitsandbytes_available(): import bitsandbytes as bnb - def setUpModule(): - global device - if hasattr(bnb, "features") and "multi_backend" in bnb.features: - device = "cuda" if torch.cuda.is_available() else "cpu" # TODO: how to add "xpu" ? - else: - device = "cuda" - @require_bitsandbytes @require_accelerate @@ -215,7 +208,7 @@ def test_rwkv_4bit(self): tok = AutoTokenizer.from_pretrained(model_id) text = "Hello my name is" - input_ids = tok.encode(text, return_tensors="pt").to(device) + input_ids = tok.encode(text, return_tensors="pt").to(torch_device) _ = model.generate(input_ids, max_new_tokens=30) @@ -226,7 +219,9 @@ def test_generate_quality(self): the same output across GPUs. So we'll generate few tokens (5-10) and check their output. 
""" encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = self.model_4bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = self.model_4bit.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -243,7 +238,7 @@ def test_generate_quality_config(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_4bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -261,7 +256,9 @@ def test_generate_quality_dequantize(self): model_4bit.dequantize() encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_4bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = model_4bit.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -282,7 +279,7 @@ def test_device_and_dtype_assignment(self): with self.assertRaises(ValueError): # Tries with a `device` - self.model_4bit.to(torch.device("cuda:0")) + self.model_4bit.to(torch.device(torch_device)) with self.assertRaises(ValueError): # Tries to cast the 4-bit model to float32 using `float()` @@ -296,7 +293,7 @@ def test_device_and_dtype_assignment(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -357,14 +354,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -382,14 +379,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = 
self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) @@ -472,7 +469,7 @@ def test_pipeline(self): "device_map": "auto", "load_in_4bit": True, # float16 isn't supported on CPU, use bfloat16 instead - "torch_dtype": torch.bloat16 if device == "cpu" else torch.float16, + "torch_dtype": torch.bfloat16 if torch_device == "cpu" else torch.float16, }, max_new_tokens=self.MAX_NEW_TOKENS, ) @@ -505,7 +502,9 @@ def test_multi_gpu_loading(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Second real batch - output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_parallel = model_parallel.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -541,10 +540,10 @@ def test_training(self): module.v_proj = LoRALayer(module.v_proj, rank=16) # Step 3: dummy batch - batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) + batch = self.tokenizer("Test batch ", return_tensors="pt").to(torch_device) # Step 4: Check if the gradient is not None - with torch.autocast(device): + with torch.autocast(torch_device): out = model.forward(**batch) out.logits.norm().backward() diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 24432ceb9b9664..3486c363ef4399 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -40,6 +40,7 @@ require_torch_gpu_if_bnb_not_multi_backend_enabled, require_torch_multi_gpu, slow, + torch_device, ) @@ -82,13 +83,6 @@ def forward(self, input, *args, **kwargs): if is_bitsandbytes_available(): import bitsandbytes as bnb - def setUpModule(): - global device - if hasattr(bnb, "features") and "multi_backend" in bnb.features: - device = "cuda" if torch.cuda.is_available() else "cpu" # TODO: how to add "xpu" ? - else: - device = "cuda" - @require_bitsandbytes @require_accelerate @@ -254,7 +248,6 @@ def test_llm_skip(self): r""" A simple test to check if `llm_int8_skip_modules` works as expected """ - import bitsandbytes as bnb quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_skip_modules=["classifier"]) seq_classification_model = AutoModelForSequenceClassification.from_pretrained( @@ -277,7 +270,9 @@ def test_generate_quality(self): the same output across GPUs. So we'll generate few tokens (5-10) and check their output. 
""" encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = self.model_8bit.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -294,7 +289,7 @@ def test_generate_quality_config(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_8bit_from_config.generate( - input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -312,7 +307,9 @@ def test_generate_quality_dequantize(self): model_8bit.dequantize() encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model_8bit.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = model_8bit.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -348,7 +345,7 @@ def test_device_and_dtype_assignment(self): with self.assertRaises(ValueError): # Tries with a `device` - self.model_8bit.to(torch.device("cuda:0")) + self.model_8bit.to(torch.device(torch_device)) with self.assertRaises(ValueError): # Tries to cast the 8-bit model to float32 using `float()` @@ -362,7 +359,7 @@ def test_device_and_dtype_assignment(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) - _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) # Check this does not throw an error _ = self.model_fp16.to("cpu") @@ -402,7 +399,7 @@ def test_int8_serialization(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_from_saved.generate( - input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -429,7 +426,7 @@ def test_int8_serialization_regression(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_from_saved.generate( - input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -456,7 +453,7 @@ def test_int8_serialization_sharded(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") output_sequences = model_from_saved.generate( - input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10 + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 ) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -477,7 +474,7 @@ def test_int8_from_pretrained(self): # generate encoded_input = self.tokenizer(self.input_text, 
return_tensors="pt") - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -516,14 +513,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -533,7 +530,6 @@ def test_inference_with_keep_in_fp32(self): `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test both cases. """ - import bitsandbytes as bnb from transformers import T5ForConditionalGeneration @@ -543,14 +539,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) def test_inference_with_keep_in_fp32_serialized(self): @@ -560,7 +556,6 @@ def test_inference_with_keep_in_fp32_serialized(self): `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test both cases. 
""" - import bitsandbytes as bnb from transformers import T5ForConditionalGeneration @@ -575,14 +570,14 @@ def test_inference_with_keep_in_fp32_serialized(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) _ = model.generate(**encoded_input) @@ -693,7 +688,9 @@ def test_multi_gpu_loading(self): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Second real batch - output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_parallel = model_parallel.generate( + input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10 + ) self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -708,7 +705,7 @@ def check_inference_correctness(self, model): encoded_input = self.tokenizer(self.input_text, return_tensors="pt") # Check the exactness of the results - output_parallel = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_parallel = model.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) # Get the generation output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True) @@ -876,10 +873,10 @@ def test_training(self): module.v_proj = LoRALayer(module.v_proj, rank=16) # Step 3: dummy batch - batch = self.tokenizer("Test batch ", return_tensors="pt").to(device) + batch = self.tokenizer("Test batch ", return_tensors="pt").to(torch_device) # Step 4: Check if the gradient is not None - with torch.autocast(device): + with torch.autocast(torch_device): out = model.forward(**batch) out.logits.norm().backward() @@ -920,6 +917,6 @@ def test_int8_from_pretrained(self): # generate encoded_input = self.tokenizer(self.input_text, return_tensors="pt") - output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10) + output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) From 73995002ea4e4383d767cc42cc60ec52d18351a7 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sun, 18 Aug 2024 23:22:12 +0000 Subject: [PATCH 18/61] allow for xpu in multi-gpu tests --- src/transformers/testing_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index e1d650eeeb6cbf..2d998056552941 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -750,7 +750,9 @@ def require_torch_multi_gpu(test_case): import torch - return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case) + device_count = torch.cuda.device_count() if not is_torch_xpu_available else 
torch.xpu.device_count() + + return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case) def require_torch_multi_accelerator(test_case): From b41059c7fc549030ba4a58696d43da451b01cbf9 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 20 Aug 2024 08:54:18 -0400 Subject: [PATCH 19/61] fix tests 4bit for CPU NF4 --- tests/quantization/bnb/test_4bit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index a56906a8837590..e7202f133d25ad 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -630,7 +630,9 @@ def test_serialization(self, quant_type="nf4", double_quant=True, safe_serializa d1[k].quant_state.as_dict().values(), ): if isinstance(v0, torch.Tensor): - self.assertTrue(torch.equal(v0, v1.to(v0.device))) + # The asbmax will not be saved in the quant_state when using NF4 in CPU + if v0.numel() != 0: + self.assertTrue(torch.equal(v0, v1.to(v0.device))) else: self.assertTrue(v0 == v1) From 1a7a6fe78934010edbdca741abdd7eb990b598aa Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 20 Aug 2024 18:10:48 +0000 Subject: [PATCH 20/61] fix bug with is_torch_xpu_available needing to be called as func --- src/transformers/testing_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 2d998056552941..c1bcc4827b2765 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -750,8 +750,8 @@ def require_torch_multi_gpu(test_case): import torch - device_count = torch.cuda.device_count() if not is_torch_xpu_available else torch.xpu.device_count() - + device_count = torch.cuda.device_count() if not is_torch_xpu_available() else torch.xpu.device_count() + print(f'{device_count = }') return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case) From 87983df82affc4093e6dffe9f3de56601ee95c7b Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 20 Aug 2024 18:16:41 +0000 Subject: [PATCH 21/61] avoid issue where test reports attr err due to other failure --- src/transformers/testing_utils.py | 2 +- tests/quantization/bnb/test_4bit.py | 3 ++- tests/quantization/bnb/test_mixed_int8.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index c1bcc4827b2765..3b7142e8ec8a55 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -751,7 +751,7 @@ def require_torch_multi_gpu(test_case): import torch device_count = torch.cuda.device_count() if not is_torch_xpu_available() else torch.xpu.device_count() - print(f'{device_count = }') + return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index e7202f133d25ad..7f0e5eabe561e9 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -450,7 +450,8 @@ def tearDown(self): TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to avoid unexpected behaviors. 
Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27 """ - del self.pipe + if hasattr(self, "pipe"): + del self.pipe gc.collect() torch.cuda.empty_cache() diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 3486c363ef4399..5a99ab32e42b8c 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -641,7 +641,8 @@ def tearDown(self): TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27 """ - del self.pipe + if hasattr(self, "pipe"): + del self.pipe gc.collect() torch.cuda.empty_cache() From 7f17188b340c10528de6bc57541cc16d5a43fd25 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 21 Aug 2024 01:08:39 +0000 Subject: [PATCH 22/61] fix formatting --- src/transformers/testing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3b7142e8ec8a55..78d8175dcee851 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -751,7 +751,7 @@ def require_torch_multi_gpu(test_case): import torch device_count = torch.cuda.device_count() if not is_torch_xpu_available() else torch.xpu.device_count() - + return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case) From bb3ba4a2c8c0d96d0077136c9e6fef815172b575 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:33:44 +0000 Subject: [PATCH 23/61] fix typo from resolving of merge conflict --- src/transformers/quantizers/quantizer_bnb_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index a5d632c538be55..a1ab425e54aebc 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -65,7 +65,7 @@ def __init__(self, quantization_config, **kwargs): self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules def validate_environment(self, *args, **kwargs): - if not (is_accelerate_available() and is_bitsandbytes_available()) + if not (is_accelerate_available() and is_bitsandbytes_available()): raise ImportError( f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) From 463c211199e56d09612b6094974ebf6d53402250 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 22 Aug 2024 11:34:38 +0800 Subject: [PATCH 24/61] polish based on last PR review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/integrations/__init__.py | 2 + src/transformers/integrations/bitsandbytes.py | 6 +- .../integrations/integration_utils.py | 63 +++++++++++++++++++ .../quantizers/quantizer_bnb_4bit.py | 14 ++--- .../quantizers/quantizer_bnb_8bit.py | 14 ++--- src/transformers/testing_utils.py | 15 ++++- src/transformers/utils/__init__.py | 29 +++++++++ src/transformers/utils/import_utils.py | 6 +- 8 files changed, 124 insertions(+), 25 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 
4c756a23ae0aa4..692cada5cd25d0 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -93,6 +93,7 @@ "run_hp_search_ray", "run_hp_search_sigopt", "run_hp_search_wandb", + "validate_bnb_backend_availability", ], "peft": ["PeftAdapterMixin"], "quanto": ["replace_with_quanto_layers"], @@ -175,6 +176,7 @@ run_hp_search_ray, run_hp_search_sigopt, run_hp_search_wandb, + validate_bnb_backend_availability, ) from .peft import PeftAdapterMixin from .quanto import replace_with_quanto_layers diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index 345796e310caaf..c89cc562c81333 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -6,7 +6,11 @@ from packaging import version -from ..utils import is_accelerate_available, is_bitsandbytes_available, logging +from ..utils import ( + is_accelerate_available, + is_bitsandbytes_available, + logging, +) if is_bitsandbytes_available(): diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 9172f9599f77b0..27378adc1f6e99 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -38,7 +38,9 @@ from ..utils import ( PushToHubMixin, flatten_dict, + get_available_devices, is_datasets_available, + is_ipex_available, is_pandas_available, is_tf_available, is_torch_available, @@ -204,6 +206,67 @@ def is_dvclive_available(): return importlib.util.find_spec("dvclive") is not None +def _validate_bnb_multi_backend_availability(raise_exception): + import bitsandbytes as bnb + + bnb_supported_devices = getattr(bnb, "supported_torch_devices", set()) + available_devices = get_available_devices() + + if available_devices == {"cpu"} and not is_ipex_available(): + from importlib.util import find_spec + + if find_spec("intel_extension_for_pytorch"): + logger.warning( + "You have Intel IPEX installed but if you're intending to use it for CPU, it might not have the right version. Be sure to double check that your PyTorch and IPEX installs are compatible." + ) + + available_devices.discard("cpu") # Only Intel CPU is supported by BNB at the moment + + if not available_devices.intersection(bnb_supported_devices): + if raise_exception: + bnb_supported_devices_with_info = set( # noqa: C401 + '"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)' + if device == "cpu" + else device + for device in bnb_supported_devices + ) + err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + + logger.error(err_msg) + raise RuntimeError(err_msg) + + logger.warning("No supported devices found for bitsandbytes multi-backend.") + return False + + logger.debug("Multi-backend validation successful.") + return True + + +def _validate_bnb_cuda_backend_availability(raise_exception): + if not torch.cuda.is_available(): + log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. 
Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + if raise_exception: + logger.error(log_msg) + raise RuntimeError(log_msg) + + logger.warning(log_msg) + return False + + logger.debug("CUDA backend validation successful.") + return True + + +def validate_bnb_backend_availability(raise_exception=False): + """ + Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. + """ + import bitsandbytes as bnb + + if "multi_backend" in getattr(bnb, "features", set()): + return _validate_bnb_multi_backend_availability(raise_exception) + return _validate_bnb_cuda_backend_availability(raise_exception) + + def hp_params(trial): if is_optuna_available(): import optuna diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index a1ab425e54aebc..a79174f9faa139 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -65,7 +65,7 @@ def __init__(self, quantization_config, **kwargs): self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules def validate_environment(self, *args, **kwargs): - if not (is_accelerate_available() and is_bitsandbytes_available()): + if not is_accelerate_available(): raise ImportError( f"Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) @@ -75,15 +75,11 @@ def validate_environment(self, *args, **kwargs): ) import bitsandbytes as bnb - bnb_is_multibackend_enabled = "multi_backend" in getattr(bnb, "features", set()) + bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set()) - if not torch.cuda.is_available(): - import bitsandbytes as bnb + from ..integrations.integration_utils import validate_bnb_backend_availability - if not bnb_is_multibackend_enabled: - raise RuntimeError( - "Current bitsandbytes (`main`) only supports CUDA, please switch to the `multi-backend-refactor` preview release for WIP support of other backends." 
- ) + validate_bnb_backend_availability(raise_exception=True) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( @@ -100,7 +96,7 @@ def validate_environment(self, *args, **kwargs): device_map_without_lm_head = { key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert } - if set(device_map.values()) == {"cpu"} and bnb_is_multibackend_enabled: + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: pass elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): raise ValueError( diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index e9e1a8dac0ec27..f10bd6ec799654 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -65,7 +65,7 @@ def __init__(self, quantization_config, **kwargs): self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules def validate_environment(self, *args, **kwargs): - if not (is_accelerate_available() and is_bitsandbytes_available()): + if not is_accelerate_available(): raise ImportError( f"Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) @@ -75,15 +75,11 @@ def validate_environment(self, *args, **kwargs): ) import bitsandbytes as bnb - bnb_is_multibackend_enabled = "multi_backend" in getattr(bnb, "features", set()) + bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set()) - if not torch.cuda.is_available(): - import bitsandbytes as bnb + from ..integrations.integration_utils import validate_bnb_backend_availability - if not bnb_is_multibackend_enabled: - raise RuntimeError( - "Current bitsandbytes (`main`) only supports CUDA, please switch to the `multi-backend-refactor` preview release for WIP support of other backends." - ) + validate_bnb_backend_availability(raise_exception=True) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): raise ValueError( @@ -100,7 +96,7 @@ def validate_environment(self, *args, **kwargs): device_map_without_lm_head = { key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert } - if set(device_map.values()) == {"cpu"} and bnb_is_multibackend_enabled: + if set(device_map.values()) == {"cpu"} and bnb_multibackend_is_enabled: pass elif "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): raise ValueError( diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 78d8175dcee851..e6a99988a926b1 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -221,6 +221,17 @@ def parse_int_from_env(key, default=None): _run_third_party_device_tests = parse_flag_from_env("RUN_THIRD_PARTY_DEVICE_TESTS", default=False) +def get_device_count(): + import torch + + if is_torch_xpu_available(): + num_devices = torch.xpu.device_count() + else: + num_devices = torch.cuda.device_count() + + return num_devices + + def is_pt_tf_cross_test(test_case): """ Decorator marking a test as a test that control interactions between PyTorch and TensorFlow. 
@@ -748,9 +759,7 @@ def require_torch_multi_gpu(test_case): if not is_torch_available(): return unittest.skip(reason="test requires PyTorch")(test_case) - import torch - - device_count = torch.cuda.device_count() if not is_torch_xpu_available() else torch.xpu.device_count() + device_count = get_device_count() return unittest.skipUnless(device_count > 1, "test requires multiple GPUs")(test_case) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index b1a1bb56cbd82c..5d48000de344ca 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from functools import lru_cache + from huggingface_hub import get_full_repo_name # for backward compatibility from huggingface_hub.constants import HF_HUB_DISABLE_TELEMETRY as DISABLE_TELEMETRY # for backward compatibility from packaging import version @@ -274,3 +276,30 @@ def check_min_version(min_version): + "Check out https://github.com/huggingface/transformers/tree/main/examples#important-note for the examples corresponding to other " "versions of HuggingFace Transformers." ) + +@lru_cache() +def get_available_devices(): + """ + Returns a set of devices available for the current PyTorch installation. + """ + devices = {"cpu"} # `cpu` is always supported as a device in PyTorch + + if is_torch_cuda_available(): + devices.add("cuda") + + if is_torch_mps_available(): + devices.add("mps") + + if is_torch_xpu_available(): + devices.add("xpu") + + if is_torch_npu_available(): + devices.add("npu") + + if is_torch_mlu_available(): + devices.add("mlu") + + if is_torch_musa_available(): + devices.add("musa") + + return devices diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 9c086b81662b76..6045d35bffafed 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -840,11 +840,11 @@ def is_torch_xpu_available(check_device=False): return hasattr(torch, "xpu") and torch.xpu.is_available() +@lru_cache() def is_bitsandbytes_available(): - if not is_torch_available(): - return False + from transformers.integrations.integration_utils import validate_bnb_backend_availability - return _bitsandbytes_available + return is_torch_available() and validate_bnb_backend_availability(raise_exception=False) def is_flash_attn_2_available(): From 6d89ee4f7792c15c876406d2c90616380c2289d8 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 28 Aug 2024 05:49:24 -0400 Subject: [PATCH 25/61] fix CI --- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 5d48000de344ca..406a010d663189 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -277,6 +277,7 @@ def check_min_version(min_version): "versions of HuggingFace Transformers." 
) + @lru_cache() def get_available_devices(): """ diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 6045d35bffafed..ff6af1b921f6b2 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -844,7 +844,9 @@ def is_torch_xpu_available(check_device=False): def is_bitsandbytes_available(): from transformers.integrations.integration_utils import validate_bnb_backend_availability - return is_torch_available() and validate_bnb_backend_availability(raise_exception=False) + return ( + _bitsandbytes_available and is_torch_available() and validate_bnb_backend_availability(raise_exception=False) + ) def is_flash_attn_2_available(): From 7e01cfb0eab9a06d7d3b471218ccdb29dfbde5b5 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:22:20 +0800 Subject: [PATCH 26/61] Update src/transformers/integrations/integration_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/integrations/integration_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 27378adc1f6e99..d58b8e484ee9ca 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -230,7 +230,8 @@ def _validate_bnb_multi_backend_availability(raise_exception): else device for device in bnb_supported_devices ) - err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. + Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" logger.error(err_msg) raise RuntimeError(err_msg) From 9bffc93da01b79200184e13758771f2a43dcd974 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:22:32 +0800 Subject: [PATCH 27/61] Update src/transformers/integrations/integration_utils.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/integrations/integration_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index d58b8e484ee9ca..f41b404c1aee60 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -245,7 +245,8 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): if not torch.cuda.is_available(): - log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. 
Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. + Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" if raise_exception: logger.error(log_msg) raise RuntimeError(log_msg) From 01b7587c0841e600663bfdbb8f6bc880c2289ae4 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 29 Aug 2024 09:21:50 -0400 Subject: [PATCH 28/61] fix error log --- src/transformers/integrations/integration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index f41b404c1aee60..130e6f6c238395 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -245,7 +245,7 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): if not torch.cuda.is_available(): - log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. + log_msg = f"CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" if raise_exception: logger.error(log_msg) From 171b130d85861c9ec9f356960575aacd31bfbde1 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 29 Aug 2024 09:36:58 -0400 Subject: [PATCH 29/61] fix error msg --- src/transformers/integrations/integration_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 130e6f6c238395..4795f108d51db2 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -230,8 +230,8 @@ def _validate_bnb_multi_backend_availability(raise_exception): else device for device in bnb_supported_devices ) - err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. - Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`." 
\ + + "Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" logger.error(err_msg) raise RuntimeError(err_msg) @@ -245,8 +245,8 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): if not torch.cuda.is_available(): - log_msg = f"CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. - Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress." \ + + " Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" if raise_exception: logger.error(log_msg) raise RuntimeError(log_msg) From 5e9bf9a940cc4484fed8fc632b8db29b4291d1fa Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 29 Aug 2024 09:38:31 -0400 Subject: [PATCH 30/61] add \n in error log --- src/transformers/integrations/integration_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 4795f108d51db2..db2ff3509b03bd 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -231,7 +231,7 @@ def _validate_bnb_multi_backend_availability(raise_exception): for device in bnb_supported_devices ) err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`." \ - + "Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + + "\nPlease check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" logger.error(err_msg) raise RuntimeError(err_msg) @@ -246,7 +246,7 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): if not torch.cuda.is_available(): log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress." 
\ - + " Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + + "\nPlease check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" if raise_exception: logger.error(log_msg) raise RuntimeError(log_msg) From 496c0467a4f401312b78e94c91056a5a109da4bb Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 29 Aug 2024 09:43:18 -0400 Subject: [PATCH 31/61] make quality --- src/transformers/integrations/integration_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index db2ff3509b03bd..f50fb7441dffd7 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -230,8 +230,10 @@ def _validate_bnb_multi_backend_availability(raise_exception): else device for device in bnb_supported_devices ) - err_msg = f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`." \ - + "\nPlease check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + err_msg = ( + f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. " + "Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + ) logger.error(err_msg) raise RuntimeError(err_msg) @@ -245,8 +247,10 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): if not torch.cuda.is_available(): - log_msg = "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress." \ - + "\nPlease check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + log_msg = ( + "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. 
" + "Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + ) if raise_exception: logger.error(log_msg) raise RuntimeError(log_msg) From 86d0016ad9706a458814df9f0be0a15c22678728 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 30 Aug 2024 12:47:31 -0400 Subject: [PATCH 32/61] rm bnb cuda restriction in doc --- docs/source/en/model_doc/chameleon.md | 2 +- docs/source/en/model_doc/llava_next.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 323b83813160b0..fa6e42b15ff84f 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -128,7 +128,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. Simply change the snippet above with: ```python from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index d0558be76467a2..299feb00c0e2f4 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -233,7 +233,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. 
Simply change the snippet above with: ```python from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig From 1c96ae90f7ed41ff14081ccf20039140c8ba07d3 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 3 Sep 2024 08:16:44 -0400 Subject: [PATCH 33/61] cpu model don't need dispatch --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index b943b5e7989f03..0066be99b783f4 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4002,7 +4002,7 @@ def from_pretrained( pass # Dispatch model with hooks on all devices if necessary - if device_map is not None: + if device_map is not None and device_map != {'': torch.device(type='cpu')}: device_map_kwargs = { "device_map": device_map, "offload_dir": offload_folder, From 3aec62677759ec906cd443b186ffd7ced6991df7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 3 Sep 2024 08:24:55 -0400 Subject: [PATCH 34/61] fix doc --- docs/source/en/model_doc/chameleon.md | 2 +- docs/source/en/model_doc/llava_next.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index fa6e42b15ff84f..8c0df86ac3643a 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -128,7 +128,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. Simply change the snippet above with: +he model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: ```python from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 299feb00c0e2f4..57e9254a0ce34d 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -233,7 +233,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). 
Simply change the snippet above with: ```python from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig From daa1e27566d67fc2f04a5842ec36154a2804a73f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 3 Sep 2024 08:31:07 -0400 Subject: [PATCH 35/61] fix style --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index f165b405adfa1f..2c4cea3cb8d347 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4018,7 +4018,7 @@ def from_pretrained( pass # Dispatch model with hooks on all devices if necessary - if device_map is not None and device_map != {'': torch.device(type='cpu')}: + if device_map is not None and device_map != {"": torch.device(type="cpu")}: device_map_kwargs = { "device_map": device_map, "offload_dir": offload_folder, From d55db0e76425f00909f58fb8e40f046d09227122 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 5 Sep 2024 10:00:16 -0400 Subject: [PATCH 36/61] check cuda available in testing --- src/transformers/modeling_utils.py | 2 +- tests/quantization/bnb/test_4bit.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2c4cea3cb8d347..f931a6af3eb21d 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4018,7 +4018,7 @@ def from_pretrained( pass # Dispatch model with hooks on all devices if necessary - if device_map is not None and device_map != {"": torch.device(type="cpu")}: + if device_map is not None: device_map_kwargs = { "device_map": device_map, "offload_dir": offload_folder, diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 5de3fd4cff30d3..d09c89e0214770 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -273,10 +273,11 @@ def test_device_assignment(self): self.assertEqual(self.model_4bit.device.type, "cpu") self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) - # Move back to CUDA device - self.model_4bit.to(0) - self.assertEqual(self.model_4bit.device, torch.device(0)) - self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) + if torch.cuda.is_available(): + # Move back to CUDA device + self.model_4bit.to(0) + self.assertEqual(self.model_4bit.device, torch.device(0)) + self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) def test_device_and_dtype_assignment(self): r""" @@ -299,6 +300,10 @@ def test_device_and_dtype_assignment(self): with self.assertRaises(ValueError): # Tries with `cuda` self.model_4bit.cuda() + else: + with self.assertRaises(ValueError): + # Tries with `str` + self.model_4bit.to("cpu") with self.assertRaises(ValueError): # Tries with a `dtype` self.model_4bit.to(torch.float16) @@ -322,8 +327,9 @@ def test_device_and_dtype_assignment(self): self.model_fp16 = self.model_fp16.to(torch.float32) _ = self.model_fp16.generate(input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10) - # Check that this does not throw an error - _ = self.model_fp16.cuda() + if torch.cuda.is_available(): + # Check that this does not throw an error + _ = self.model_fp16.cuda() # Check this does not throw an error _ = self.model_fp16.to("cpu") From a21a9168fb06bbc701956cfe1678bf94c44e53e0 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 5 Sep 2024 15:05:08 -0400 Subject: [PATCH 37/61] fix tests ---
tests/quantization/bnb/test_4bit.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index d09c89e0214770..7ced31dfd7101f 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -300,19 +300,11 @@ def test_device_and_dtype_assignment(self): with self.assertRaises(ValueError): # Tries with `cuda` self.model_4bit.cuda() - else: - with self.assertRaises(ValueError): - # Tries with `str` - self.model_4bit.to("cpu") with self.assertRaises(ValueError): # Tries with a `dtype` self.model_4bit.to(torch.float16) - with self.assertRaises(ValueError): - # Tries with a `device` - self.model_4bit.to(torch.device(torch_device)) - with self.assertRaises(ValueError): # Tries to cast the 4-bit model to float32 using `float()` self.model_4bit.float() @@ -322,6 +314,8 @@ def test_device_and_dtype_assignment(self): self.model_4bit.half() # Test if we did not break anything + self.model_4bit.to(torch.device(torch_device)) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt") self.model_fp16 = self.model_fp16.to(torch.float32) From 8ad17e82e710cfbd69c5a0ce6b16356eae3fc564 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:24:39 +0800 Subject: [PATCH 38/61] Update docs/source/en/model_doc/chameleon.md Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/model_doc/chameleon.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 8c0df86ac3643a..25886be8a0dc52 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -128,7 +128,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -he model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). 
Simply change the snippet above with: ```python from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig From 107e02b01a9579c169bd60c66f0a3b45b03fb1f1 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:25:10 +0800 Subject: [PATCH 39/61] Update docs/source/en/model_doc/llava_next.md Co-authored-by: Aarni Koskela --- docs/source/en/model_doc/llava_next.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 57e9254a0ce34d..eac77542a11654 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -233,7 +233,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. If you want to run bitsandbytes on CPU or other devices, please follow [these installation instructions](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: ```python from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig From 3bab7d76032e832a718babeb15da049eeb857811 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Sep 2024 09:21:22 -0400 Subject: [PATCH 40/61] fix doc --- docs/source/en/model_doc/chameleon.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 25886be8a0dc52..19c6bcf2ffe715 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -128,7 +128,7 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [instruction](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [these installation instructions](https://huggingface.co/docs/bitsandbytes/main/en/installation). 
Simply change the snippet above with: ```python from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig From 20f6b5e2b235a3c8a23074ed87aa807f833cbdc9 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:51:32 +0800 Subject: [PATCH 41/61] Update tests/quantization/bnb/test_4bit.py Co-authored-by: Aarni Koskela --- tests/quantization/bnb/test_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 7ced31dfd7101f..9a7afe8b8902bb 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -661,7 +661,7 @@ def test_serialization(self, quant_type="nf4", double_quant=True, safe_serializa d1[k].quant_state.as_dict().values(), ): if isinstance(v0, torch.Tensor): - # The asbmax will not be saved in the quant_state when using NF4 in CPU + # The absmax will not be saved in the quant_state when using NF4 in CPU if v0.numel() != 0: self.assertTrue(torch.equal(v0, v1.to(v0.device))) else: From 9ac038efba53a3c92f099eacb5a17fb31011303f Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:28:55 +0800 Subject: [PATCH 42/61] Update tests/quantization/bnb/test_4bit.py Co-authored-by: Aarni Koskela --- tests/quantization/bnb/test_4bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 9a7afe8b8902bb..336ee22ce5dbb5 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -275,8 +275,8 @@ def test_device_assignment(self): if torch.cuda.is_available(): # Move back to CUDA device - self.model_4bit.to(0) - self.assertEqual(self.model_4bit.device, torch.device(0)) + self.model_4bit.to("cuda") + self.assertEqual(self.model_4bit.device, torch.device("cuda")) self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) def test_device_and_dtype_assignment(self): From 08f31f875751d4432261fefb2fc5f34fadfc77d8 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Sep 2024 10:37:30 -0400 Subject: [PATCH 43/61] fix check multibackends --- src/transformers/integrations/__init__.py | 2 ++ src/transformers/integrations/integration_utils.py | 7 ++++++- src/transformers/quantizers/quantizer_bnb_4bit.py | 6 ++---- src/transformers/quantizers/quantizer_bnb_8bit.py | 6 ++---- src/transformers/testing_utils.py | 8 +++----- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 20327e3abf16a6..63bc89544ffe33 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -94,6 +94,7 @@ "run_hp_search_sigopt", "run_hp_search_wandb", "validate_bnb_backend_availability", + "is_bitsandbytes_multi_backend_available", ], "peft": ["PeftAdapterMixin"], "quanto": ["replace_with_quanto_layers"], @@ -188,6 +189,7 @@ run_hp_search_sigopt, run_hp_search_wandb, validate_bnb_backend_availability, + is_bitsandbytes_multi_backend_available, ) from .peft import PeftAdapterMixin from .quanto import replace_with_quanto_layers diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index f50fb7441dffd7..158cdeabf605e9 100755 --- a/src/transformers/integrations/integration_utils.py +++ 
b/src/transformers/integrations/integration_utils.py @@ -206,6 +206,11 @@ def is_dvclive_available(): return importlib.util.find_spec("dvclive") is not None +def is_bitsandbytes_multi_backend_available() -> bool: + import bitsandbytes as bnb + return "multi_backend" in getattr(bnb, "features", set()) + + def _validate_bnb_multi_backend_availability(raise_exception): import bitsandbytes as bnb @@ -268,7 +273,7 @@ def validate_bnb_backend_availability(raise_exception=False): """ import bitsandbytes as bnb - if "multi_backend" in getattr(bnb, "features", set()): + if is_bitsandbytes_multi_backend_available(): return _validate_bnb_multi_backend_availability(raise_exception) return _validate_bnb_cuda_backend_availability(raise_exception) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index ef3fbb8f9cfe23..99ab886e3ea89c 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -74,12 +74,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - import bitsandbytes as bnb - - bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set()) - from ..integrations.integration_utils import validate_bnb_backend_availability + from ..integrations.integration_utils import validate_bnb_backend_availability, is_bitsandbytes_multi_backend_available + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index f10bd6ec799654..1e5803c2d0d4e0 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -73,12 +73,10 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - import bitsandbytes as bnb - - bnb_multibackend_is_enabled = "multi_backend" in getattr(bnb, "features", set()) - from ..integrations.integration_utils import validate_bnb_backend_availability + from ..integrations.integration_utils import validate_bnb_backend_availability, is_bitsandbytes_multi_backend_available + bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) if kwargs.get("from_tf", False) or kwargs.get("from_flax", False): diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index a0c7ad8dfa7f8b..7f063326d7d2c5 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -49,6 +49,7 @@ is_sigopt_available, is_tensorboard_available, is_wandb_available, + is_bitsandbytes_multi_backend_available, ) from .integrations.deepspeed import is_deepspeed_available from .utils import ( @@ -978,11 +979,8 @@ def require_torch_gpu_if_bnb_not_multi_backend_enabled(test_case): """ Decorator marking a test that requires a GPU if bitsandbytes multi-backend feature is not enabled. 
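A short usage sketch for the decorator described above, with a placeholder test body that is not taken from this patch:

```python
from transformers.testing_utils import require_torch_gpu_if_bnb_not_multi_backend_enabled


@require_torch_gpu_if_bnb_not_multi_backend_enabled
def test_4bit_generation():
    # Placeholder: with a multi-backend bitsandbytes install the test is not forced onto CUDA;
    # with a CUDA-only install the usual GPU requirement still applies.
    ...
```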
""" - if is_bitsandbytes_available(): - import bitsandbytes as bnb - - if hasattr(bnb, "features") and "multi_backend" in bnb.features: - return test_case + if is_bitsandbytes_available() and is_bitsandbytes_multi_backend_available(): + return test_case return require_torch_gpu(test_case) From 9eb09705c250ece56351ac66947f05a90459769c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Sep 2024 10:42:01 -0400 Subject: [PATCH 44/61] fix import sort --- src/transformers/integrations/__init__.py | 2 +- src/transformers/integrations/integration_utils.py | 2 +- src/transformers/quantizers/quantizer_bnb_4bit.py | 5 ++++- src/transformers/quantizers/quantizer_bnb_8bit.py | 5 ++++- src/transformers/testing_utils.py | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 63bc89544ffe33..59192d61128bdb 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -168,6 +168,7 @@ get_reporting_integration_callbacks, hp_params, is_azureml_available, + is_bitsandbytes_multi_backend_available, is_clearml_available, is_codecarbon_available, is_comet_available, @@ -189,7 +190,6 @@ run_hp_search_sigopt, run_hp_search_wandb, validate_bnb_backend_availability, - is_bitsandbytes_multi_backend_available, ) from .peft import PeftAdapterMixin from .quanto import replace_with_quanto_layers diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 158cdeabf605e9..fd31cc5fc081ba 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -208,6 +208,7 @@ def is_dvclive_available(): def is_bitsandbytes_multi_backend_available() -> bool: import bitsandbytes as bnb + return "multi_backend" in getattr(bnb, "features", set()) @@ -271,7 +272,6 @@ def validate_bnb_backend_availability(raise_exception=False): """ Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. 
""" - import bitsandbytes as bnb if is_bitsandbytes_multi_backend_available(): return _validate_bnb_multi_backend_availability(raise_exception) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 99ab886e3ea89c..3b197522ee0b5f 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -75,7 +75,10 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import validate_bnb_backend_availability, is_bitsandbytes_multi_backend_available + from ..integrations.integration_utils import ( + is_bitsandbytes_multi_backend_available, + validate_bnb_backend_availability, + ) bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 1e5803c2d0d4e0..62f19185e3f8b6 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -74,7 +74,10 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import validate_bnb_backend_availability, is_bitsandbytes_multi_backend_available + from ..integrations.integration_utils import ( + is_bitsandbytes_multi_backend_available, + validate_bnb_backend_availability, + ) bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 7f063326d7d2c5..64f3ac0f15171c 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -43,13 +43,13 @@ from transformers import logging as transformers_logging from .integrations import ( + is_bitsandbytes_multi_backend_available, is_clearml_available, is_optuna_available, is_ray_available, is_sigopt_available, is_tensorboard_available, is_wandb_available, - is_bitsandbytes_multi_backend_available, ) from .integrations.deepspeed import is_deepspeed_available from .utils import ( From b506b98e115770d52df86511c7acd7431171f670 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 11 Sep 2024 10:51:06 -0400 Subject: [PATCH 45/61] remove check torch in bnb --- src/transformers/utils/import_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 4123ca00bad037..cb076df0efe342 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -853,9 +853,7 @@ def is_torch_xpu_available(check_device=False): def is_bitsandbytes_available(): from transformers.integrations.integration_utils import validate_bnb_backend_availability - return ( - _bitsandbytes_available and is_torch_available() and validate_bnb_backend_availability(raise_exception=False) - ) + return _bitsandbytes_available and validate_bnb_backend_availability(raise_exception=False) def is_flash_attn_2_available(): From 2be4169773f02f2595ca4859e9693f8bebdefc61 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 11 Sep 2024 
10:35:51 +0000 Subject: [PATCH 46/61] docs: update bitsandbytes references with multi-backend info --- docs/source/en/llm_tutorial_optimization.md | 2 +- docs/source/en/model_doc/chameleon.md | 12 +++++++++++- docs/source/en/model_doc/llava_next.md | 12 +++++++++++- docs/source/en/model_doc/llava_next_video.md | 12 +++++++++++- docs/source/en/model_doc/llava_onevision.md | 14 ++++++++++++-- docs/source/en/model_doc/video_llava.md | 12 +++++++++++- docs/source/en/model_memory_anatomy.md | 2 +- docs/source/en/perf_train_gpu_one.md | 2 +- docs/source/en/quantization/bitsandbytes.md | 8 ++++++++ docs/source/en/quantization/overview.md | 16 +++++++++++++++- 10 files changed, 82 insertions(+), 10 deletions(-) diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index a675a6de39a2fc..9d3d8ad6ba8b86 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -181,7 +181,7 @@ for every matrix multiplication. Dequantization and re-quantization is performed Therefore, inference time is often **not** reduced when using quantized weights, but rather increases. Enough theory, let's give it a try! To quantize the weights with Transformers, you need to make sure that -the [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library is installed. +the [`bitsandbytes`](https://github.com/bitsandbytes-foundation/bitsandbytes) library is installed. ```bash !pip install bitsandbytes diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 19c6bcf2ffe715..420a06ec746e34 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -128,7 +128,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. If you want to run bitsandbytes on CPU or other devices, please follow the [these installation instructions](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library. + + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. 
+ + + +Simply change the snippet above with: ```python from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index eac77542a11654..f04827cc7d5f74 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -233,7 +233,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza ### Quantization using Bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`. If you want to run bitsandbytes on CPU or other devices, please follow [these installation instructions](https://huggingface.co/docs/bitsandbytes/main/en/installation). Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library. + + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. + + + +Simply change the snippet above with: ```python from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index 48e50f950621e8..fe905dfb7932ab 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -205,7 +205,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. This allows for efficient deployment on resource-constrained cases. -First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a CUDA compatible GPU device. Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: +First, make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library. + + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. 
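As a rough companion to the 4-bit example, the 8-bit path mentioned in these docs reduces to a one-line config; the checkpoint id below is an assumption used only for illustration:

```python
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration

# 8-bit variant; the checkpoint id is illustrative.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
```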
+ + + +Then simply load the quantized model by adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: ```python diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index 64a127abca4c28..717784da738d8c 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -264,9 +264,19 @@ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spac ## Model optimization -### Quantization using Bitsandbytes +### Quantization using bitsandbytes -The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with: +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a GPU/accelerator that is supported by the library. + + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. + + + +Simply change the snippet above with: ```python from transformers import LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index f098e82a177670..1c4b5b4b874dd7 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -139,7 +139,17 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. his allows for efficient deployment on resource-constrained cases. -First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a CUDA compatible GPU device. Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: +First make sure to install bitsandbytes by running `pip install bitsandbytes` and to have access to a GPU/accelerator that is supported by the library. + + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. 
+ + + +Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: ```python diff --git a/docs/source/en/model_memory_anatomy.md b/docs/source/en/model_memory_anatomy.md index c1d9d4c54bc728..44c197aae5cfe4 100644 --- a/docs/source/en/model_memory_anatomy.md +++ b/docs/source/en/model_memory_anatomy.md @@ -233,7 +233,7 @@ Let's look at the details. **Optimizer States:** - 8 bytes * number of parameters for normal AdamW (maintains 2 states) -- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) +- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state) **Gradients** diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index c90f2ca58483bf..364fc46544c6fd 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -284,7 +284,7 @@ training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bn However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated. -First, follow the installation guide in the GitHub [repo](https://github.com/TimDettmers/bitsandbytes) to install the `bitsandbytes` library +First, follow the installation guide in the GitHub [repo](https://github.com/bitsandbytes-foundation/bitsandbytes) to install the `bitsandbytes` library that implements the 8-bit Adam optimizer. Next you need to initialize the optimizer. This involves two steps: diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index 334b6145e537fe..e9447555e82449 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -38,6 +38,14 @@ pip install --upgrade accelerate transformers + + +bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. + + + Now you can quantize a model by passing a `BitsAndBytesConfig` to [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it supports loading with Accelerate and contains `torch.nn.Linear` layers. diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 9eb74793a12797..97bb0cf5326308 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -49,7 +49,7 @@ Use the table below to help you decide which quantization method to use. 
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| | [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | | [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes | +| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | | [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | @@ -57,3 +57,17 @@ Use the table below to help you decide which quantization method to use. | [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | | [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | | [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | + + + +\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + +We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. + + + + + +\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. + + From e607b7c33cbe816a1c3098cf7e72625d5edb0a58 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 11 Sep 2024 19:18:03 +0000 Subject: [PATCH 47/61] docs: fix small mistakes in bnb paragraph --- docs/source/en/model_doc/mixtral.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 26eff8ec21ad7a..71c7d7921ef005 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -141,7 +141,7 @@ The Flash Attention-2 model uses also a more memory efficient cache slicing mech As the Mixtral model has 45 billion parameters, that would require about 90GB of GPU RAM in half precision (float16), since each parameter is stored in 2 bytes. However, one can shrink down the size of the model using [quantization](../quantization.md). If the model is quantized to 4 bits (or half a byte per parameter), a single A100 with 40GB of RAM is enough to fit the entire model, as in that case only about 27 GB of RAM is required. 
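The memory figures above follow from simple arithmetic; a back-of-the-envelope sketch, using decimal gigabytes and an assumed count of 45B parameters:

```python
params = 45e9                           # ~45B parameters
print(f"{params * 2 / 1e9:.0f} GB")     # float16: 2 bytes per parameter -> ~90 GB
print(f"{params * 0.5 / 1e9:.1f} GB")   # 4-bit: 0.5 bytes per parameter -> ~22.5 GB of weights;
                                        # layers kept in higher precision and quantization metadata
                                        # push the practical footprint toward the ~27 GB quoted above
```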
-Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the BitsAndyBytes quantization (but refer to [this page](../quantization.md) for other quantization methods): +Quantizing a model is as simple as passing a `quantization_config` to the model. Below, we'll leverage the bitsandbytes quantization library (but refer to [this page](../quantization.md) for alternative quantization methods): ```python >>> import torch From ac108c6d6b34f45a5745a736ba57282405cfaa61 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 11 Sep 2024 21:50:13 +0000 Subject: [PATCH 48/61] run formatting --- .circleci/create_circleci_config.py | 184 ++++++++++++++++--------- .circleci/parse_test_outputs.py | 28 ++-- benchmark/benchmark.py | 2 - scripts/benchmark/trainer-benchmark.py | 7 +- 4 files changed, 136 insertions(+), 85 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index d8d3e7d86cf383..6759c876c8c75d 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -16,10 +16,9 @@ import argparse import copy import os -import random from dataclasses import dataclass from typing import Any, Dict, List, Optional -import glob + import yaml @@ -32,7 +31,7 @@ "RUN_PT_FLAX_CROSS_TESTS": False, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None} +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf": None} DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] @@ -42,7 +41,7 @@ class EmptyJob: def to_dict(self): return { "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), - "steps":["checkout"], + "steps": ["checkout"], } @@ -72,7 +71,10 @@ def __post_init__(self): else: # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED print(os.environ.get("GIT_COMMIT_MESSAGE")) - if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci": + if ( + "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") + or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci" + ): self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev" print(f"Using {self.docker_image} docker image") if self.install_steps is None: @@ -82,7 +84,7 @@ def __post_init__(self): if isinstance(self.tests_to_run, str): self.tests_to_run = [self.tests_to_run] else: - test_file = os.path.join("test_preparation" , f"{self.job_name}_test_list.txt") + test_file = os.path.join("test_preparation", f"{self.job_name}_test_list.txt") print("Looking for ", test_file) if os.path.exists(test_file): with open(test_file) as f: @@ -105,46 +107,93 @@ def to_dict(self): job["resource_class"] = self.resource_class all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] + pytest_flags = [ + f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" + for key, value in all_options.items() + ] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues + # Examples special case: we need to download NLTK files 
in advance to avoid cuncurrency issues timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else "" marker_cmd = f"-m '{self.marker}'" if self.marker is not None else "" - additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" - parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> ' + additional_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" + parallel = f" << pipeline.parameters.{self.job_name}_parallelism >> " steps = [ "checkout", {"attach_workspace": {"at": "test_preparation"}}, {"run": "apt-get update && apt-get install -y curl"}, {"run": " && ".join(self.install_steps)}, - {"run": {"name": "Download NLTK files", "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """} if "example" in self.name else "echo Skipping"}, - {"run": { + { + "run": { + "name": "Download NLTK files", + "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """, + } + if "example" in self.name + else "echo Skipping" + }, + { + "run": { "name": "Show installed libraries and their size", - "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true"""} + "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true""", + } }, - {"run": { - "name": "Show installed libraries and their versions", - "command": """pip list --format=freeze | tee installed.txt || true"""} + { + "run": { + "name": "Show installed libraries and their versions", + "command": """pip list --format=freeze | tee installed.txt || true""", + } }, - {"run": { - "name": "Show biggest libraries", - "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} + { + "run": { + "name": "Show biggest libraries", + "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true""", + } }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, - {"run": {"name": "Split tests across parallel nodes: show current parallel tests", - "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" - } + { + "run": { + "name": "Get files to test", + "command": f"curl -L -o {self.job_name}_test_list.txt <>" + if self.name != "pr_documentation_tests" + else 'echo "Skipped"', + } + }, + { + "run": { + "name": "Split tests across parallel nodes: show current parallel tests", + "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" + if self.parallelism + else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt", + } + }, + { + "run": { + "name": "Run tests", + "command": f"({timeout_cmd} python3 -m 
pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)", + } }, - {"run": { - "name": "Run tests", - "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} + { + "run": { + "name": "Expand to show skipped tests", + "when": "always", + "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip", + } + }, + { + "run": { + "name": "Failed tests: show reasons", + "when": "always", + "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail", + } + }, + { + "run": { + "name": "Errors", + "when": "always", + "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors", + } }, - {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, - {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, - {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}}, {"store_test_results": {"path": "test-results"}}, {"store_artifacts": {"path": "test-results/junit.xml"}}, {"store_artifacts": {"path": "reports"}}, @@ -159,13 +208,17 @@ def to_dict(self): @property def job_name(self): - return self.name if ("examples" in self.name or "pipeline" in self.name or "pr_documentation" in self.name) else f"tests_{self.name}" + return ( + self.name + if ("examples" in self.name or "pipeline" in self.name or "pr_documentation" in self.name) + else f"tests_{self.name}" + ) # JOBS torch_and_tf_job = CircleCIJob( "torch_and_tf", - docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], + docker_image=[{"image": "huggingface/transformers-torch-tf-light"}], additional_env={"RUN_PT_TF_CROSS_TESTS": True}, marker="is_pt_tf_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -175,7 +228,7 @@ def job_name(self): torch_and_flax_job = CircleCIJob( "torch_and_flax", additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, - docker_image=[{"image":"huggingface/transformers-torch-jax-light"}], + docker_image=[{"image": "huggingface/transformers-torch-jax-light"}], marker="is_pt_flax_cross_test", pytest_options={"rA": None, "durations": 0}, ) @@ -185,7 +238,7 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8 + pytest_num_workers=8, ) generate_job = CircleCIJob( @@ -193,54 +246,48 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="generate", parallelism=6, - pytest_num_workers=8 + pytest_num_workers=8, ) tokenization_job = CircleCIJob( "tokenization", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=16 + pytest_num_workers=16, ) processor_job = CircleCIJob( - "processors", - docker_image=[{"image": "huggingface/transformers-torch-light"}], - parallelism=8, - pytest_num_workers=6 + "processors", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, pytest_num_workers=6 ) tf_job = CircleCIJob( "tf", - docker_image=[{"image":"huggingface/transformers-tf-light"}], + docker_image=[{"image": "huggingface/transformers-tf-light"}], parallelism=6, 
pytest_num_workers=16, ) flax_job = CircleCIJob( - "flax", - docker_image=[{"image":"huggingface/transformers-jax-light"}], - parallelism=6, - pytest_num_workers=16 + "flax", docker_image=[{"image": "huggingface/transformers-jax-light"}], parallelism=6, pytest_num_workers=16 ) pipelines_torch_job = CircleCIJob( "pipelines_torch", additional_env={"RUN_PIPELINE_TESTS": True}, - docker_image=[{"image":"huggingface/transformers-torch-light"}], + docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) pipelines_tf_job = CircleCIJob( "pipelines_tf", additional_env={"RUN_PIPELINE_TESTS": True}, - docker_image=[{"image":"huggingface/transformers-tf-light"}], + docker_image=[{"image": "huggingface/transformers-tf-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -254,7 +301,7 @@ def job_name(self): examples_torch_job = CircleCIJob( "examples_torch", additional_env={"OMP_NUM_THREADS": 8}, - docker_image=[{"image":"huggingface/transformers-examples-torch"}], + docker_image=[{"image": "huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"], pytest_num_workers=8, @@ -264,7 +311,7 @@ def job_name(self): examples_tensorflow_job = CircleCIJob( "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, - docker_image=[{"image":"huggingface/transformers-examples-tf"}], + docker_image=[{"image": "huggingface/transformers-examples-tf"}], pytest_num_workers=16, ) @@ -272,9 +319,9 @@ def job_name(self): hub_job = CircleCIJob( "hub", additional_env={"HUGGINGFACE_CO_STAGING": True}, - docker_image=[{"image":"huggingface/transformers-torch-light"}], + docker_image=[{"image": "huggingface/transformers-torch-light"}], install_steps=[ - 'uv venv && uv pip install .', + "uv venv && uv pip install .", 'git config --global user.email "ci@dummy.com"', 'git config --global user.name "ci"', ], @@ -285,7 +332,7 @@ def job_name(self): onnx_job = CircleCIJob( "onnx", - docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], + docker_image=[{"image": "huggingface/transformers-torch-tf-light"}], install_steps=[ "uv venv", "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", @@ -297,7 +344,7 @@ def job_name(self): exotic_models_job = CircleCIJob( "exotic_models", - docker_image=[{"image":"huggingface/transformers-exotic-models"}], + docker_image=[{"image": "huggingface/transformers-exotic-models"}], pytest_num_workers=12, parallelism=4, pytest_options={"durations": 100}, @@ -306,7 +353,7 @@ def job_name(self): repo_utils_job = CircleCIJob( "repo_utils", - docker_image=[{"image":"huggingface/transformers-consistency"}], + docker_image=[{"image": "huggingface/transformers-consistency"}], pytest_num_workers=4, resource_class="large", ) @@ -320,7 +367,7 @@ def job_name(self): command = f'echo """{py_command}""" > pr_documentation_tests_temp.txt' doc_test_job = CircleCIJob( "pr_documentation_tests", - docker_image=[{"image":"huggingface/transformers-consistency"}], + docker_image=[{"image": "huggingface/transformers-consistency"}], additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, install_steps=[ # Add an empty file to keep the test step running correctly even no file is selected to be tested. 
@@ -328,7 +375,7 @@ def job_name(self): "touch dummy.py", command, "cat pr_documentation_tests_temp.txt", - "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests_test_list.txt" + "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests_test_list.txt", ], tests_to_run="$(cat pr_documentation_tests.txt)", # noqa pytest_options={"-doctest-modules": None, "doctest-glob": "*.md", "dist": "loadfile", "rvsA": None}, @@ -336,37 +383,42 @@ def job_name(self): pytest_num_workers=1, ) -REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job] # fmt: skip +REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job] # fmt: skip EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job] PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job] REPO_UTIL_TESTS = [repo_utils_job] DOC_TESTS = [doc_test_job] ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip + def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() os.environ["test_preparation_dir"] = folder - jobs = [k for k in ALL_TESTS if os.path.isfile(os.path.join("test_preparation" , f"{k.job_name}_test_list.txt") )] + jobs = [k for k in ALL_TESTS if os.path.isfile(os.path.join("test_preparation", f"{k.job_name}_test_list.txt"))] print("The following jobs will be run ", jobs) if len(jobs) == 0: jobs = [EmptyJob()] - print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) + print("Full list of job name inputs", {j.job_name + "_test_list": {"type": "string", "default": ""} for j in jobs}) config = { "version": "2.1", "parameters": { # Only used to accept the parameters from the trigger "nightly": {"type": "boolean", "default": False}, - "tests_to_run": {"type": "string", "default": ''}, - **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, - **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, + "tests_to_run": {"type": "string", "default": ""}, + **{j.job_name + "_test_list": {"type": "string", "default": ""} for j in jobs}, + **{j.job_name + "_parallelism": {"type": "integer", "default": 1} for j in jobs}, }, - "jobs" : {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + "jobs": {j.job_name: j.to_dict() for j in jobs}, + "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}}, } with open(os.path.join(folder, "generated_config.yml"), "w") as f: - f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) + f.write( + yaml.dump(config, sort_keys=False, default_flow_style=False) + .replace("' << pipeline", " << pipeline") + .replace(">> '", " >>") + ) if __name__ == "__main__": diff --git a/.circleci/parse_test_outputs.py b/.circleci/parse_test_outputs.py index a69da1a3eafb27..21f186c76b5e76 100644 --- a/.circleci/parse_test_outputs.py +++ b/.circleci/parse_test_outputs.py @@ -1,53 +1,57 @@ -import re import argparse +import re + def parse_pytest_output(file_path): skipped_tests = {} skipped_count = 0 - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line in file: - match = re.match(r'^SKIPPED \[(\d+)\] (tests/.*): 
(.*)$', line) + match = re.match(r"^SKIPPED \[(\d+)\] (tests/.*): (.*)$", line) if match: skipped_count += 1 test_file, test_line, reason = match.groups() skipped_tests[reason] = skipped_tests.get(reason, []) + [(test_file, test_line)] - for k,v in sorted(skipped_tests.items(), key=lambda x:len(x[1])): + for k, v in sorted(skipped_tests.items(), key=lambda x: len(x[1])): print(f"{len(v):4} skipped because: {k}") print("Number of skipped tests:", skipped_count) + def parse_pytest_failure_output(file_path): failed_tests = {} failed_count = 0 - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line in file: - match = re.match(r'^FAILED (tests/.*) - (.*): (.*)$', line) + match = re.match(r"^FAILED (tests/.*) - (.*): (.*)$", line) if match: failed_count += 1 _, error, reason = match.groups() failed_tests[reason] = failed_tests.get(reason, []) + [error] - for k,v in sorted(failed_tests.items(), key=lambda x:len(x[1])): + for k, v in sorted(failed_tests.items(), key=lambda x: len(x[1])): print(f"{len(v):4} failed because `{v[0]}` -> {k}") print("Number of failed tests:", failed_count) - if failed_count>0: + if failed_count > 0: exit(1) + def parse_pytest_errors_output(file_path): print(file_path) error_tests = {} error_count = 0 - with open(file_path, 'r') as file: + with open(file_path, "r") as file: for line in file: - match = re.match(r'^ERROR (tests/.*) - (.*): (.*)$', line) + match = re.match(r"^ERROR (tests/.*) - (.*): (.*)$", line) if match: error_count += 1 _, test_error, reason = match.groups() error_tests[reason] = error_tests.get(reason, []) + [test_error] - for k,v in sorted(error_tests.items(), key=lambda x:len(x[1])): + for k, v in sorted(error_tests.items(), key=lambda x: len(x[1])): print(f"{len(v):4} errored out because of `{v[0]}` -> {k}") print("Number of errors:", error_count) - if error_count>0: + if error_count > 0: exit(1) + def main(): parser = argparse.ArgumentParser() parser.add_argument("--file", help="file to parse") diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 304bbd4441cf66..f0507b86622ece 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -31,9 +31,7 @@ from pathlib import Path from git import Repo - from huggingface_hub import HfApi - from optimum_benchmark import Benchmark from optimum_benchmark_wrapper import main diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index c9470eeeae8548..2304e46465ed63 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -181,23 +181,21 @@ def get_original_command(max_width=80, full_python_path=False): def get_base_command(args, output_dir): - # unwrap multi-line input args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) # remove --output_dir if any and set our own - args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) + args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd) args.base_cmd += f" --output_dir {output_dir}" # ensure we have --overwrite_output_dir - args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) + args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd) args.base_cmd += " --overwrite_output_dir" return [sys.executable] + shlex.split(args.base_cmd) def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): - # Enable to debug everything but the run itself, to do it fast and see the progress. 
# This is useful for debugging the output formatting quickly - we can remove it later once # everybody is happy with the output @@ -296,7 +294,6 @@ def get_versions(): def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir): - df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" From c66e7e752c70d46e631141eb22f493f7751d83f0 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 12 Sep 2024 05:53:21 -0400 Subject: [PATCH 49/61] reveret bnb check --- src/transformers/utils/import_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 611a263df1870f..3a0353ffb64438 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -851,6 +851,9 @@ def is_torch_xpu_available(check_device=False): @lru_cache() def is_bitsandbytes_available(): + if not is_torch_available(): + return False + from transformers.integrations.integration_utils import validate_bnb_backend_availability return _bitsandbytes_available and validate_bnb_backend_availability(raise_exception=False) From 8f25ee26b456f36d683e4ea2e0de5e667525f8f6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 13 Sep 2024 12:01:16 -0400 Subject: [PATCH 50/61] move bnb multi-backend check to import_utils --- src/transformers/integrations/__init__.py | 2 -- src/transformers/integrations/integration_utils.py | 10 +++------- src/transformers/quantizers/quantizer_bnb_4bit.py | 6 ++---- src/transformers/quantizers/quantizer_bnb_8bit.py | 6 ++---- src/transformers/testing_utils.py | 2 +- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 6 ++++++ 7 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 59192d61128bdb..20327e3abf16a6 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -94,7 +94,6 @@ "run_hp_search_sigopt", "run_hp_search_wandb", "validate_bnb_backend_availability", - "is_bitsandbytes_multi_backend_available", ], "peft": ["PeftAdapterMixin"], "quanto": ["replace_with_quanto_layers"], @@ -168,7 +167,6 @@ get_reporting_integration_callbacks, hp_params, is_azureml_available, - is_bitsandbytes_multi_backend_available, is_clearml_available, is_codecarbon_available, is_comet_available, diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index fd31cc5fc081ba..5b86098ac6d8d3 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -206,12 +206,6 @@ def is_dvclive_available(): return importlib.util.find_spec("dvclive") is not None -def is_bitsandbytes_multi_backend_available() -> bool: - import bitsandbytes as bnb - - return "multi_backend" in getattr(bnb, "features", set()) - - def _validate_bnb_multi_backend_availability(raise_exception): import bitsandbytes as bnb @@ -273,7 +267,9 @@ def validate_bnb_backend_availability(raise_exception=False): Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. 
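After this move the two checks are importable from `transformers.utils`; a minimal usage sketch, with illustrative output strings:

```python
from transformers.utils import is_bitsandbytes_available, is_bitsandbytes_multi_backend_available

if is_bitsandbytes_available():
    flavor = "multi-backend" if is_bitsandbytes_multi_backend_available() else "CUDA-only"
    print(f"bitsandbytes is usable ({flavor} build)")
else:
    print("bitsandbytes is not usable on this machine")
```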
""" - if is_bitsandbytes_multi_backend_available(): + import bitsandbytes as bnb + + if "multi_backend" in getattr(bnb, "features", set()): return _validate_bnb_multi_backend_availability(raise_exception) return _validate_bnb_cuda_backend_availability(raise_exception) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index 3b197522ee0b5f..f75787fa6d6888 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -75,10 +75,8 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import ( - is_bitsandbytes_multi_backend_available, - validate_bnb_backend_availability, - ) + from ..integrations.integration_utils import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 62f19185e3f8b6..8fa8121dc17d1f 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -74,10 +74,8 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import ( - is_bitsandbytes_multi_backend_available, - validate_bnb_backend_availability, - ) + from ..integrations.integration_utils import validate_bnb_backend_availability + from ..utils import is_bitsandbytes_multi_backend_available bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() validate_bnb_backend_availability(raise_exception=True) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 64f3ac0f15171c..197a2c48078e09 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -43,7 +43,6 @@ from transformers import logging as transformers_logging from .integrations import ( - is_bitsandbytes_multi_backend_available, is_clearml_available, is_optuna_available, is_ray_available, @@ -62,6 +61,7 @@ is_auto_gptq_available, is_av_available, is_bitsandbytes_available, + is_bitsandbytes_multi_backend_available, is_bs4_available, is_cv2_available, is_cython_available, diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 35b39790499c1c..cc04d1a366585a 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -120,6 +120,7 @@ is_auto_gptq_available, is_av_available, is_bitsandbytes_available, + is_bitsandbytes_multi_backend_available, is_bs4_available, is_coloredlogs_available, is_cv2_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 3a0353ffb64438..e8de5446ad0770 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -849,6 +849,12 @@ def is_torch_xpu_available(check_device=False): return hasattr(torch, "xpu") and torch.xpu.is_available() +def is_bitsandbytes_multi_backend_available() -> bool: + import bitsandbytes as bnb + + return "multi_backend" in getattr(bnb, "features", set()) + + @lru_cache() def is_bitsandbytes_available(): if not 
is_torch_available(): From 32cbb8d43b39981eb7c097d621dbae432269cc78 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 05:29:35 -0400 Subject: [PATCH 51/61] fix bnb check --- src/transformers/integrations/__init__.py | 4 +- src/transformers/integrations/bitsandbytes.py | 70 +++++++++++++++++++ .../integrations/integration_utils.py | 70 ------------------- .../quantizers/quantizer_bnb_4bit.py | 2 +- .../quantizers/quantizer_bnb_8bit.py | 2 +- src/transformers/utils/import_utils.py | 22 +++--- 6 files changed, 88 insertions(+), 82 deletions(-) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 20327e3abf16a6..00bbcf2d060fe9 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -31,6 +31,7 @@ "replace_with_bnb_linear", "set_module_8bit_tensor_to_device", "set_module_quantized_tensor_to_device", + "validate_bnb_backend_availability", ], "deepspeed": [ "HfDeepSpeedConfig", @@ -93,7 +94,6 @@ "run_hp_search_ray", "run_hp_search_sigopt", "run_hp_search_wandb", - "validate_bnb_backend_availability", ], "peft": ["PeftAdapterMixin"], "quanto": ["replace_with_quanto_layers"], @@ -125,6 +125,7 @@ replace_with_bnb_linear, set_module_8bit_tensor_to_device, set_module_quantized_tensor_to_device, + validate_bnb_backend_availability, ) from .deepspeed import ( HfDeepSpeedConfig, @@ -187,7 +188,6 @@ run_hp_search_ray, run_hp_search_sigopt, run_hp_search_wandb, - validate_bnb_backend_availability, ) from .peft import PeftAdapterMixin from .quanto import replace_with_quanto_layers diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index c89cc562c81333..b9faba1b823e78 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -7,8 +7,10 @@ from packaging import version from ..utils import ( + get_available_devices, is_accelerate_available, is_bitsandbytes_available, + is_ipex_available, logging, ) @@ -474,3 +476,71 @@ def dequantize_and_replace( ) return model + + +def _validate_bnb_multi_backend_availability(raise_exception): + import bitsandbytes as bnb + + bnb_supported_devices = getattr(bnb, "supported_torch_devices", set()) + available_devices = get_available_devices() + + if available_devices == {"cpu"} and not is_ipex_available(): + from importlib.util import find_spec + + if find_spec("intel_extension_for_pytorch"): + logger.warning( + "You have Intel IPEX installed but if you're intending to use it for CPU, it might not have the right version. Be sure to double check that your PyTorch and IPEX installs are compatible." + ) + + available_devices.discard("cpu") # Only Intel CPU is supported by BNB at the moment + + if not available_devices.intersection(bnb_supported_devices): + if raise_exception: + bnb_supported_devices_with_info = set( # noqa: C401 + '"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)' + if device == "cpu" + else device + for device in bnb_supported_devices + ) + err_msg = ( + f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. 
" + "Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + ) + + logger.error(err_msg) + raise RuntimeError(err_msg) + + logger.warning("No supported devices found for bitsandbytes multi-backend.") + return False + + logger.debug("Multi-backend validation successful.") + return True + + +def _validate_bnb_cuda_backend_availability(raise_exception): + if not torch.cuda.is_available(): + log_msg = ( + "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. " + "Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" + ) + if raise_exception: + logger.error(log_msg) + raise RuntimeError(log_msg) + + logger.warning(log_msg) + return False + + logger.debug("CUDA backend validation successful.") + return True + + +def validate_bnb_backend_availability(raise_exception=False): + """ + Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. + """ + + import bitsandbytes as bnb + + if "multi_backend" in getattr(bnb, "features", set()): + return _validate_bnb_multi_backend_availability(raise_exception) + return _validate_bnb_cuda_backend_availability(raise_exception) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 5b86098ac6d8d3..9172f9599f77b0 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -38,9 +38,7 @@ from ..utils import ( PushToHubMixin, flatten_dict, - get_available_devices, is_datasets_available, - is_ipex_available, is_pandas_available, is_tf_available, is_torch_available, @@ -206,74 +204,6 @@ def is_dvclive_available(): return importlib.util.find_spec("dvclive") is not None -def _validate_bnb_multi_backend_availability(raise_exception): - import bitsandbytes as bnb - - bnb_supported_devices = getattr(bnb, "supported_torch_devices", set()) - available_devices = get_available_devices() - - if available_devices == {"cpu"} and not is_ipex_available(): - from importlib.util import find_spec - - if find_spec("intel_extension_for_pytorch"): - logger.warning( - "You have Intel IPEX installed but if you're intending to use it for CPU, it might not have the right version. Be sure to double check that your PyTorch and IPEX installs are compatible." - ) - - available_devices.discard("cpu") # Only Intel CPU is supported by BNB at the moment - - if not available_devices.intersection(bnb_supported_devices): - if raise_exception: - bnb_supported_devices_with_info = set( # noqa: C401 - '"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)' - if device == "cpu" - else device - for device in bnb_supported_devices - ) - err_msg = ( - f"None of the available devices `available_devices = {available_devices or None}` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {bnb_supported_devices_with_info}`. 
" - "Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" - ) - - logger.error(err_msg) - raise RuntimeError(err_msg) - - logger.warning("No supported devices found for bitsandbytes multi-backend.") - return False - - logger.debug("Multi-backend validation successful.") - return True - - -def _validate_bnb_cuda_backend_availability(raise_exception): - if not torch.cuda.is_available(): - log_msg = ( - "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. " - "Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend" - ) - if raise_exception: - logger.error(log_msg) - raise RuntimeError(log_msg) - - logger.warning(log_msg) - return False - - logger.debug("CUDA backend validation successful.") - return True - - -def validate_bnb_backend_availability(raise_exception=False): - """ - Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. - """ - - import bitsandbytes as bnb - - if "multi_backend" in getattr(bnb, "features", set()): - return _validate_bnb_multi_backend_availability(raise_exception) - return _validate_bnb_cuda_backend_availability(raise_exception) - - def hp_params(trial): if is_optuna_available(): import optuna diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index f75787fa6d6888..73e7664aeb884d 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -75,7 +75,7 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import validate_bnb_backend_availability + from ..integrations import validate_bnb_backend_availability from ..utils import is_bitsandbytes_multi_backend_available bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index 8fa8121dc17d1f..65d97716d02cf8 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -74,7 +74,7 @@ def validate_environment(self, *args, **kwargs): "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`" ) - from ..integrations.integration_utils import validate_bnb_backend_availability + from ..integrations import validate_bnb_backend_availability from ..utils import is_bitsandbytes_multi_backend_available bnb_multibackend_is_enabled = is_bitsandbytes_multi_backend_available() diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index e8de5446ad0770..4084d4ffadf7c1 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -849,20 +849,26 @@ def is_torch_xpu_available(check_device=False): return hasattr(torch, "xpu") and torch.xpu.is_available() -def is_bitsandbytes_multi_backend_available() -> bool: - import bitsandbytes as bnb - - return "multi_backend" in getattr(bnb, "features", set()) - - @lru_cache() def is_bitsandbytes_available(): if not 
is_torch_available(): return False - from transformers.integrations.integration_utils import validate_bnb_backend_availability + import torch + + if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.1"): + return torch.cuda.is_available() and _bitsandbytes_available + else: + return _bitsandbytes_available + + +def is_bitsandbytes_multi_backend_available() -> bool: + if not is_bitsandbytes_available(): + return False + + import bitsandbytes as bnb - return _bitsandbytes_available and validate_bnb_backend_availability(raise_exception=False) + return "multi_backend" in getattr(bnb, "features", set()) def is_flash_attn_2_available(): From 4ce4b55840f29346350ccef90da3a704f9429bbd Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 05:33:31 -0400 Subject: [PATCH 52/61] minor fix for bnb --- src/transformers/integrations/bitsandbytes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index b9faba1b823e78..90d5e20d431f0e 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -11,6 +11,7 @@ is_accelerate_available, is_bitsandbytes_available, is_ipex_available, + is_torch_available, logging, ) @@ -518,6 +519,11 @@ def _validate_bnb_multi_backend_availability(raise_exception): def _validate_bnb_cuda_backend_availability(raise_exception): + if not is_torch_available(): + return False + + import torch + if not torch.cuda.is_available(): log_msg = ( "CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. " @@ -538,6 +544,8 @@ def validate_bnb_backend_availability(raise_exception=False): """ Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. 
""" + if not is_bitsandbytes_available(): + return False import bitsandbytes as bnb From 937ed3bf974a3c5e6382ada3abeb3912b40890c5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 05:36:26 -0400 Subject: [PATCH 53/61] check lib first --- src/transformers/utils/import_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 4084d4ffadf7c1..d15f90360d83d6 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -851,15 +851,15 @@ def is_torch_xpu_available(check_device=False): @lru_cache() def is_bitsandbytes_available(): - if not is_torch_available(): + if not is_torch_available() or not _bitsandbytes_available: return False import torch if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.1"): - return torch.cuda.is_available() and _bitsandbytes_available + return torch.cuda.is_available() else: - return _bitsandbytes_available + return True def is_bitsandbytes_multi_backend_available() -> bool: From e40f28479b46f1ab329b7d88012c208cc36b1a1a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 05:43:52 -0400 Subject: [PATCH 54/61] fix code style --- src/transformers/integrations/bitsandbytes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index 90d5e20d431f0e..d70c41e236ac89 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -10,6 +10,7 @@ get_available_devices, is_accelerate_available, is_bitsandbytes_available, + is_bitsandbytes_multi_backend_available, is_ipex_available, is_torch_available, logging, @@ -547,8 +548,6 @@ def validate_bnb_backend_availability(raise_exception=False): if not is_bitsandbytes_available(): return False - import bitsandbytes as bnb - - if "multi_backend" in getattr(bnb, "features", set()): + if is_bitsandbytes_multi_backend_available(): return _validate_bnb_multi_backend_availability(raise_exception) return _validate_bnb_cuda_backend_availability(raise_exception) From a4333cb38087f6fb4e164e2bb2ad58f01834dc3c Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Sat, 14 Sep 2024 16:30:21 +0800 Subject: [PATCH 55/61] Update src/transformers/utils/import_utils.py Co-authored-by: Aarni Koskela --- src/transformers/utils/import_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index d15f90360d83d6..5a0960a0d328b7 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -856,10 +856,13 @@ def is_bitsandbytes_available(): import torch + # `bitsandbytes` versions older than 0.43.1 eagerly require CUDA at import time, + # so those versions of the library are practically only available when CUDA is too. if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.1"): return torch.cuda.is_available() - else: - return True + + # Newer versions of `bitsandbytes` can be imported on systems without CUDA. 
+ return True def is_bitsandbytes_multi_backend_available() -> bool: From b8093ce8e53d2c1b915243abda41b8d01dffb6e6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 12:28:58 -0400 Subject: [PATCH 56/61] Revert "run formatting" This reverts commit ac108c6d6b34f45a5745a736ba57282405cfaa61. --- .circleci/create_circleci_config.py | 184 +++++++++---------------- .circleci/parse_test_outputs.py | 28 ++-- benchmark/benchmark.py | 2 + scripts/benchmark/trainer-benchmark.py | 7 +- 4 files changed, 85 insertions(+), 136 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 6759c876c8c75d..d8d3e7d86cf383 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -16,9 +16,10 @@ import argparse import copy import os +import random from dataclasses import dataclass from typing import Any, Dict, List, Optional - +import glob import yaml @@ -31,7 +32,7 @@ "RUN_PT_FLAX_CROSS_TESTS": False, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf": None} +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None} DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] @@ -41,7 +42,7 @@ class EmptyJob: def to_dict(self): return { "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), - "steps": ["checkout"], + "steps":["checkout"], } @@ -71,10 +72,7 @@ def __post_init__(self): else: # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED print(os.environ.get("GIT_COMMIT_MESSAGE")) - if ( - "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") - or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci" - ): + if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci": self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev" print(f"Using {self.docker_image} docker image") if self.install_steps is None: @@ -84,7 +82,7 @@ def __post_init__(self): if isinstance(self.tests_to_run, str): self.tests_to_run = [self.tests_to_run] else: - test_file = os.path.join("test_preparation", f"{self.job_name}_test_list.txt") + test_file = os.path.join("test_preparation" , f"{self.job_name}_test_list.txt") print("Looking for ", test_file) if os.path.exists(test_file): with open(test_file) as f: @@ -107,93 +105,46 @@ def to_dict(self): job["resource_class"] = self.resource_class all_options = {**COMMON_PYTEST_OPTIONS, **self.pytest_options} - pytest_flags = [ - f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" - for key, value in all_options.items() - ] + pytest_flags = [f"--{key}={value}" if (value is not None or key in ["doctest-modules"]) else f"-{key}" for key, value in all_options.items()] pytest_flags.append( f"--make-reports={self.name}" if "examples" in self.name else f"--make-reports=tests_{self.name}" ) - # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues + # Examples special case: we need to download NLTK files in advance to avoid cuncurrency issues timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else "" marker_cmd = f"-m '{self.marker}'" if self.marker is not None else "" - additional_flags = " -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" - parallel = f" << pipeline.parameters.{self.job_name}_parallelism >> " + additional_flags = f" -p 
no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml" + parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> ' steps = [ "checkout", {"attach_workspace": {"at": "test_preparation"}}, {"run": "apt-get update && apt-get install -y curl"}, {"run": " && ".join(self.install_steps)}, - { - "run": { - "name": "Download NLTK files", - "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """, - } - if "example" in self.name - else "echo Skipping" - }, - { - "run": { + {"run": {"name": "Download NLTK files", "command": """python -c "import nltk; nltk.download('punkt', quiet=True)" """} if "example" in self.name else "echo Skipping"}, + {"run": { "name": "Show installed libraries and their size", - "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true""", - } + "command": """du -h -d 1 "$(pip -V | cut -d ' ' -f 4 | sed 's/pip//g')" | grep -vE "dist-info|_distutils_hack|__pycache__" | sort -h | tee installed.txt || true"""} }, - { - "run": { - "name": "Show installed libraries and their versions", - "command": """pip list --format=freeze | tee installed.txt || true""", - } + {"run": { + "name": "Show installed libraries and their versions", + "command": """pip list --format=freeze | tee installed.txt || true"""} }, - { - "run": { - "name": "Show biggest libraries", - "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true""", - } + {"run": { + "name": "Show biggest libraries", + "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - { - "run": { - "name": "Get files to test", - "command": f"curl -L -o {self.job_name}_test_list.txt <>" - if self.name != "pr_documentation_tests" - else 'echo "Skipped"', - } - }, - { - "run": { - "name": "Split tests across parallel nodes: show current parallel tests", - "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" - if self.parallelism - else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt", - } - }, - { - "run": { - "name": "Run tests", - "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)", - } + {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, + {"run": {"name": "Split tests across parallel nodes: show current parallel tests", + "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" + } }, - { - "run": { - "name": "Expand to show skipped tests", - "when": "always", - "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip", - } - }, - { - "run": { - "name": "Failed tests: show reasons", - 
"when": "always", - "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail", - } - }, - { - "run": { - "name": "Errors", - "when": "always", - "command": "python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors", - } + {"run": { + "name": "Run tests", + "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} }, + {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, + {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, + {"run": {"name": "Errors", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --errors"}}, {"store_test_results": {"path": "test-results"}}, {"store_artifacts": {"path": "test-results/junit.xml"}}, {"store_artifacts": {"path": "reports"}}, @@ -208,17 +159,13 @@ def to_dict(self): @property def job_name(self): - return ( - self.name - if ("examples" in self.name or "pipeline" in self.name or "pr_documentation" in self.name) - else f"tests_{self.name}" - ) + return self.name if ("examples" in self.name or "pipeline" in self.name or "pr_documentation" in self.name) else f"tests_{self.name}" # JOBS torch_and_tf_job = CircleCIJob( "torch_and_tf", - docker_image=[{"image": "huggingface/transformers-torch-tf-light"}], + docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], additional_env={"RUN_PT_TF_CROSS_TESTS": True}, marker="is_pt_tf_cross_test", pytest_options={"rA": None, "durations": 0}, @@ -228,7 +175,7 @@ def job_name(self): torch_and_flax_job = CircleCIJob( "torch_and_flax", additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, - docker_image=[{"image": "huggingface/transformers-torch-jax-light"}], + docker_image=[{"image":"huggingface/transformers-torch-jax-light"}], marker="is_pt_flax_cross_test", pytest_options={"rA": None, "durations": 0}, ) @@ -238,7 +185,7 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8, + pytest_num_workers=8 ) generate_job = CircleCIJob( @@ -246,48 +193,54 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="generate", parallelism=6, - pytest_num_workers=8, + pytest_num_workers=8 ) tokenization_job = CircleCIJob( "tokenization", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=16, + pytest_num_workers=16 ) processor_job = CircleCIJob( - "processors", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, pytest_num_workers=6 + "processors", + docker_image=[{"image": "huggingface/transformers-torch-light"}], + parallelism=8, + pytest_num_workers=6 ) tf_job = CircleCIJob( "tf", - docker_image=[{"image": "huggingface/transformers-tf-light"}], + docker_image=[{"image":"huggingface/transformers-tf-light"}], parallelism=6, pytest_num_workers=16, ) flax_job = CircleCIJob( - "flax", docker_image=[{"image": "huggingface/transformers-jax-light"}], parallelism=6, pytest_num_workers=16 + "flax", + docker_image=[{"image":"huggingface/transformers-jax-light"}], + parallelism=6, + pytest_num_workers=16 ) pipelines_torch_job = CircleCIJob( "pipelines_torch", additional_env={"RUN_PIPELINE_TESTS": True}, - 
docker_image=[{"image": "huggingface/transformers-torch-light"}], + docker_image=[{"image":"huggingface/transformers-torch-light"}], marker="is_pipeline_test", - parallelism=4, + parallelism=4 ) pipelines_tf_job = CircleCIJob( "pipelines_tf", additional_env={"RUN_PIPELINE_TESTS": True}, - docker_image=[{"image": "huggingface/transformers-tf-light"}], + docker_image=[{"image":"huggingface/transformers-tf-light"}], marker="is_pipeline_test", - parallelism=4, + parallelism=4 ) @@ -301,7 +254,7 @@ def job_name(self): examples_torch_job = CircleCIJob( "examples_torch", additional_env={"OMP_NUM_THREADS": 8}, - docker_image=[{"image": "huggingface/transformers-examples-torch"}], + docker_image=[{"image":"huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"], pytest_num_workers=8, @@ -311,7 +264,7 @@ def job_name(self): examples_tensorflow_job = CircleCIJob( "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, - docker_image=[{"image": "huggingface/transformers-examples-tf"}], + docker_image=[{"image":"huggingface/transformers-examples-tf"}], pytest_num_workers=16, ) @@ -319,9 +272,9 @@ def job_name(self): hub_job = CircleCIJob( "hub", additional_env={"HUGGINGFACE_CO_STAGING": True}, - docker_image=[{"image": "huggingface/transformers-torch-light"}], + docker_image=[{"image":"huggingface/transformers-torch-light"}], install_steps=[ - "uv venv && uv pip install .", + 'uv venv && uv pip install .', 'git config --global user.email "ci@dummy.com"', 'git config --global user.name "ci"', ], @@ -332,7 +285,7 @@ def job_name(self): onnx_job = CircleCIJob( "onnx", - docker_image=[{"image": "huggingface/transformers-torch-tf-light"}], + docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], install_steps=[ "uv venv", "uv pip install .[torch,tf,testing,sentencepiece,onnxruntime,vision,rjieba]", @@ -344,7 +297,7 @@ def job_name(self): exotic_models_job = CircleCIJob( "exotic_models", - docker_image=[{"image": "huggingface/transformers-exotic-models"}], + docker_image=[{"image":"huggingface/transformers-exotic-models"}], pytest_num_workers=12, parallelism=4, pytest_options={"durations": 100}, @@ -353,7 +306,7 @@ def job_name(self): repo_utils_job = CircleCIJob( "repo_utils", - docker_image=[{"image": "huggingface/transformers-consistency"}], + docker_image=[{"image":"huggingface/transformers-consistency"}], pytest_num_workers=4, resource_class="large", ) @@ -367,7 +320,7 @@ def job_name(self): command = f'echo """{py_command}""" > pr_documentation_tests_temp.txt' doc_test_job = CircleCIJob( "pr_documentation_tests", - docker_image=[{"image": "huggingface/transformers-consistency"}], + docker_image=[{"image":"huggingface/transformers-consistency"}], additional_env={"TRANSFORMERS_VERBOSITY": "error", "DATASETS_VERBOSITY": "error", "SKIP_CUDA_DOCTEST": "1"}, install_steps=[ # Add an empty file to keep the test step running correctly even no file is selected to be tested. 
@@ -375,7 +328,7 @@ def job_name(self): "touch dummy.py", command, "cat pr_documentation_tests_temp.txt", - "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests_test_list.txt", + "tail -n1 pr_documentation_tests_temp.txt | tee pr_documentation_tests_test_list.txt" ], tests_to_run="$(cat pr_documentation_tests.txt)", # noqa pytest_options={"-doctest-modules": None, "doctest-glob": "*.md", "dist": "loadfile", "rvsA": None}, @@ -383,42 +336,37 @@ def job_name(self): pytest_num_workers=1, ) -REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job] # fmt: skip +REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job] # fmt: skip EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job] PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job] REPO_UTIL_TESTS = [repo_utils_job] DOC_TESTS = [doc_test_job] ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip - def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() os.environ["test_preparation_dir"] = folder - jobs = [k for k in ALL_TESTS if os.path.isfile(os.path.join("test_preparation", f"{k.job_name}_test_list.txt"))] + jobs = [k for k in ALL_TESTS if os.path.isfile(os.path.join("test_preparation" , f"{k.job_name}_test_list.txt") )] print("The following jobs will be run ", jobs) if len(jobs) == 0: jobs = [EmptyJob()] - print("Full list of job name inputs", {j.job_name + "_test_list": {"type": "string", "default": ""} for j in jobs}) + print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) config = { "version": "2.1", "parameters": { # Only used to accept the parameters from the trigger "nightly": {"type": "boolean", "default": False}, - "tests_to_run": {"type": "string", "default": ""}, - **{j.job_name + "_test_list": {"type": "string", "default": ""} for j in jobs}, - **{j.job_name + "_parallelism": {"type": "integer", "default": 1} for j in jobs}, + "tests_to_run": {"type": "string", "default": ''}, + **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, + **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, }, - "jobs": {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}}, + "jobs" : {j.job_name: j.to_dict() for j in jobs}, + "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} } with open(os.path.join(folder, "generated_config.yml"), "w") as f: - f.write( - yaml.dump(config, sort_keys=False, default_flow_style=False) - .replace("' << pipeline", " << pipeline") - .replace(">> '", " >>") - ) + f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) if __name__ == "__main__": diff --git a/.circleci/parse_test_outputs.py b/.circleci/parse_test_outputs.py index 21f186c76b5e76..a69da1a3eafb27 100644 --- a/.circleci/parse_test_outputs.py +++ b/.circleci/parse_test_outputs.py @@ -1,57 +1,53 @@ -import argparse import re - +import argparse def parse_pytest_output(file_path): skipped_tests = {} skipped_count = 0 - with open(file_path, "r") as file: + with open(file_path, 'r') as file: for line in file: - match = re.match(r"^SKIPPED \[(\d+)\] 
(tests/.*): (.*)$", line) + match = re.match(r'^SKIPPED \[(\d+)\] (tests/.*): (.*)$', line) if match: skipped_count += 1 test_file, test_line, reason = match.groups() skipped_tests[reason] = skipped_tests.get(reason, []) + [(test_file, test_line)] - for k, v in sorted(skipped_tests.items(), key=lambda x: len(x[1])): + for k,v in sorted(skipped_tests.items(), key=lambda x:len(x[1])): print(f"{len(v):4} skipped because: {k}") print("Number of skipped tests:", skipped_count) - def parse_pytest_failure_output(file_path): failed_tests = {} failed_count = 0 - with open(file_path, "r") as file: + with open(file_path, 'r') as file: for line in file: - match = re.match(r"^FAILED (tests/.*) - (.*): (.*)$", line) + match = re.match(r'^FAILED (tests/.*) - (.*): (.*)$', line) if match: failed_count += 1 _, error, reason = match.groups() failed_tests[reason] = failed_tests.get(reason, []) + [error] - for k, v in sorted(failed_tests.items(), key=lambda x: len(x[1])): + for k,v in sorted(failed_tests.items(), key=lambda x:len(x[1])): print(f"{len(v):4} failed because `{v[0]}` -> {k}") print("Number of failed tests:", failed_count) - if failed_count > 0: + if failed_count>0: exit(1) - def parse_pytest_errors_output(file_path): print(file_path) error_tests = {} error_count = 0 - with open(file_path, "r") as file: + with open(file_path, 'r') as file: for line in file: - match = re.match(r"^ERROR (tests/.*) - (.*): (.*)$", line) + match = re.match(r'^ERROR (tests/.*) - (.*): (.*)$', line) if match: error_count += 1 _, test_error, reason = match.groups() error_tests[reason] = error_tests.get(reason, []) + [test_error] - for k, v in sorted(error_tests.items(), key=lambda x: len(x[1])): + for k,v in sorted(error_tests.items(), key=lambda x:len(x[1])): print(f"{len(v):4} errored out because of `{v[0]}` -> {k}") print("Number of errors:", error_count) - if error_count > 0: + if error_count>0: exit(1) - def main(): parser = argparse.ArgumentParser() parser.add_argument("--file", help="file to parse") diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index f0507b86622ece..304bbd4441cf66 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -31,7 +31,9 @@ from pathlib import Path from git import Repo + from huggingface_hub import HfApi + from optimum_benchmark import Benchmark from optimum_benchmark_wrapper import main diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py index 2304e46465ed63..c9470eeeae8548 100755 --- a/scripts/benchmark/trainer-benchmark.py +++ b/scripts/benchmark/trainer-benchmark.py @@ -181,21 +181,23 @@ def get_original_command(max_width=80, full_python_path=False): def get_base_command(args, output_dir): + # unwrap multi-line input args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd) # remove --output_dir if any and set our own - args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd) + args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd) args.base_cmd += f" --output_dir {output_dir}" # ensure we have --overwrite_output_dir - args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd) + args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd) args.base_cmd += " --overwrite_output_dir" return [sys.executable] + shlex.split(args.base_cmd) def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose): + # Enable to debug everything but the run itself, to do it fast and see the progress. 
# This is useful for debugging the output formatting quickly - we can remove it later once # everybody is happy with the output @@ -294,6 +296,7 @@ def get_versions(): def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir): + df = pd.DataFrame(results) variation_key = "variation" diff_key = "diff_%" From 0551d2398f4f1b726237c63eb0cf5e514cbb8a96 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sat, 14 Sep 2024 12:31:15 -0400 Subject: [PATCH 57/61] fix format --- src/transformers/utils/import_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 5a0960a0d328b7..7dcea2e8449168 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -860,7 +860,7 @@ def is_bitsandbytes_available(): # so those versions of the library are practically only available when CUDA is too. if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.1"): return torch.cuda.is_available() - + # Newer versions of `bitsandbytes` can be imported on systems without CUDA. return True From e33e43bb943ea101d1ec764d4e69abab2babc11b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 18 Sep 2024 06:19:24 -0400 Subject: [PATCH 58/61] give warning when bnb version is low and no cuda found] --- src/transformers/integrations/bitsandbytes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index d70c41e236ac89..3222f26bcf5bb0 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -546,6 +546,10 @@ def validate_bnb_backend_availability(raise_exception=False): Validates if the available devices are supported by bitsandbytes, optionally raising an exception if not. 
""" if not is_bitsandbytes_available(): + if importlib.util.find_spec("bitsandbytes") and version.parse( + importlib.metadata.version("bitsandbytes") + ) < version.parse("0.43.1"): + return _validate_bnb_cuda_backend_availability(raise_exception) return False if is_bitsandbytes_multi_backend_available(): From 170dd585c8a9aa8511b437ca055895762f7c6442 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sun, 22 Sep 2024 03:07:19 +0000 Subject: [PATCH 59/61] fix device assignment check to be multi-device capable --- tests/quantization/bnb/test_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 336ee22ce5dbb5..0ac9b3d82fc7b0 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -276,7 +276,7 @@ def test_device_assignment(self): if torch.cuda.is_available(): # Move back to CUDA device self.model_4bit.to("cuda") - self.assertEqual(self.model_4bit.device, torch.device("cuda")) + self.assertEqual(self.model_4bit.device.type, "cuda") self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before) def test_device_and_dtype_assignment(self): From 9ba4a5e7a7add912a82441e11a17b37599a8784a Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 23 Sep 2024 21:20:45 +0000 Subject: [PATCH 60/61] address akx feedback on get_avlbl_dev fn --- src/transformers/__init__.py | 3 +++ src/transformers/utils/__init__.py | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 36775d8454ab8c..a0435b00d4d8c1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -27,6 +27,7 @@ from .utils import ( OptionalDependencyNotAvailable, _LazyModule, + get_available_devices, is_bitsandbytes_available, is_essentia_available, is_flax_available, @@ -917,6 +918,7 @@ "TensorType", "add_end_docstrings", "add_start_docstrings", + "get_available_devices", "is_apex_available", "is_av_available", "is_bitsandbytes_available", @@ -5741,6 +5743,7 @@ TensorType, add_end_docstrings, add_start_docstrings, + get_available_devices, is_apex_available, is_av_available, is_bitsandbytes_available, diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index cc04d1a366585a..93976c2375565b 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -16,6 +16,7 @@ # limitations under the License. from functools import lru_cache +from typing import FrozenSet from huggingface_hub import get_full_repo_name # for backward compatibility from huggingface_hub.constants import HF_HUB_DISABLE_TELEMETRY as DISABLE_TELEMETRY # for backward compatibility @@ -283,9 +284,9 @@ def check_min_version(min_version): @lru_cache() -def get_available_devices(): +def get_available_devices() -> FrozenSet[str]: """ - Returns a set of devices available for the current PyTorch installation. + Returns a frozenset of devices available for the current PyTorch installation. 
""" devices = {"cpu"} # `cpu` is always supported as a device in PyTorch @@ -307,4 +308,4 @@ def get_available_devices(): if is_torch_musa_available(): devices.add("musa") - return devices + return frozenset(devices) From 594f6f8a99c609790566eecfeea7c56606bc0949 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 24 Sep 2024 08:43:58 +0000 Subject: [PATCH 61/61] we don't want the function tat publicc, as docs would be too much --- src/transformers/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a0435b00d4d8c1..36775d8454ab8c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -27,7 +27,6 @@ from .utils import ( OptionalDependencyNotAvailable, _LazyModule, - get_available_devices, is_bitsandbytes_available, is_essentia_available, is_flax_available, @@ -918,7 +917,6 @@ "TensorType", "add_end_docstrings", "add_start_docstrings", - "get_available_devices", "is_apex_available", "is_av_available", "is_bitsandbytes_available", @@ -5743,7 +5741,6 @@ TensorType, add_end_docstrings, add_start_docstrings, - get_available_devices, is_apex_available, is_av_available, is_bitsandbytes_available,