@@ -199,6 +199,10 @@ def _get_quantized_weights_iterator(
 
         if self.pre_quant:
             if self.load_8bit:
+                if current_platform.is_hpu():
+                    raise ValueError(
+                        "currently hpu supports 4bit quantization only")
+
                 return self._quantized_8bit_generator(
                     hf_weights_files, use_safetensors,
                     quant_state_dict), quant_state_dict
@@ -302,6 +306,10 @@ def _parse_quant_state(param_name: str,
                         in temp_state_dict):
                     quant_state = _parse_quant_state(mapped_weight_name,
                                                      temp_state_dict)
+                    if current_platform.is_hpu():
+                        assert quant_state.quant_type == "nf4", (
+                            "currently hpu supports nf4 quant_type only")
+
                     quant_state_dict[mapped_weight_name] = quant_state
                     yield org_weight_name, weight_tensor
                 else:
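
Read together, the two new checks enforce the HPU constraint from both directions: an 8-bit load request fails fast with a ValueError, and any pre-quantized state that reaches the 4-bit path must carry the nf4 quant_type. A minimal sketch of a quantization config that satisfies both checks, assuming the standard transformers BitsAndBytesConfig API (illustrative only, not part of this commit):

    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,          # load_in_8bit=True would hit the new ValueError on HPU
        bnb_4bit_quant_type="nf4",  # the only quant_type the HPU path asserts
    )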
@@ -372,10 +380,12 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors,
                                                   ...]
 
                 # bitsandbytes requires data in GPU
-                if weight_sub_tensor.is_cuda:
+                if (weight_sub_tensor.is_cuda
+                        or weight_sub_tensor.device.type == "hpu"):
                     loaded_weight = weight_sub_tensor
                 else:
-                    loaded_weight = weight_sub_tensor.cuda()
+                    loaded_weight = weight_sub_tensor.to(
+                        device=current_platform.device_type)
 
                 # remove the following after the issue is fixed:
                 # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
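
The last hunk replaces the hard-coded .cuda() with a platform-driven .to() call, so the same branch now serves both CUDA and HPU. A minimal sketch of that device-selection pattern in isolation, assuming current_platform.device_type resolves to "cuda" or "hpu" as in vllm.platforms (the helper name is hypothetical):

    import torch

    from vllm.platforms import current_platform

    def _to_accelerator(weight_sub_tensor: torch.Tensor) -> torch.Tensor:
        # bitsandbytes needs the data on an accelerator: keep tensors that
        # already live on CUDA or HPU, otherwise copy to the platform device.
        if (weight_sub_tensor.is_cuda
                or weight_sub_tensor.device.type == "hpu"):
            return weight_sub_tensor
        return weight_sub_tensor.to(device=current_platform.device_type)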