Disable exllama on all non-CUDA devices (#2003)
* Disable exllama on all non-CUDA devices.

1. Disable exllama on all non-CUDA devices.
2. Don't raise an error when running on a non-CUDA device; warn and set `disable_exllama=True` instead.

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Refine the code

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Fix errors reported by make style

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Add hpu device

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Update optimum/gptq/constants.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update optimum/gptq/quantizer.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Fix error reported by make style

Signed-off-by: yuanwu <yuan.wu@intel.com>

---------

Signed-off-by: yuanwu <yuan.wu@intel.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
yuanwu2017 and IlyasMoutawwakil authored Sep 18, 2024
1 parent ca36fc4 commit 2179d33
Showing 1 changed file with 8 additions and 7 deletions.
optimum/gptq/quantizer.py (8 additions, 7 deletions)

@@ -546,7 +546,7 @@ def tmp(_, input, output):
 
         if self.bits == 4:
             # device not on gpu
-            if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])):
+            if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])):
                 if not self.disable_exllama:
                     logger.warning(
                         "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
@@ -589,13 +589,14 @@ def post_init_model(self, model):
                 The input model
         """
         if self.bits == 4 and not self.disable_exllama:
-            if get_device(model) == torch.device("cpu") or (
-                hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"])
+            if get_device(model).type != "cuda" or (
+                hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"])
             ):
-                raise ValueError(
-                    "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU."
-                    "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
-                )
+                if not self.disable_exllama:
+                    logger.warning(
+                        "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
+                    )
+                    self.disable_exllama = True
 
         class StoreAttr(object):
             pass
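
To make the new fallback concrete, here is a minimal standalone sketch of the logic post_init_model now follows; resolve_disable_exllama is a hypothetical helper for illustration, not part of the optimum API. A 4-bit model on a non-CUDA device no longer triggers a ValueError; the quantizer warns and flips the flag instead:

import logging

import torch

logger = logging.getLogger(__name__)

def resolve_disable_exllama(device: torch.device, bits: int, disable_exllama: bool) -> bool:
    # Mirror the new behavior: warn and disable exllama on non-CUDA devices
    # for 4-bit models, instead of raising as the old code did.
    if bits == 4 and not disable_exllama and device.type != "cuda":
        logger.warning(
            "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires "
            "all the modules to be on GPU. Setting `disable_exllama=True`"
        )
        disable_exllama = True
    return disable_exllama

# On HPU or CPU the flag flips to True (with a warning); on CUDA it is left alone.
assert resolve_disable_exllama(torch.device("hpu"), bits=4, disable_exllama=False)
assert not resolve_disable_exllama(torch.device("cuda", 0), bits=4, disable_exllama=False)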
