FIX GPTQModel Lora Wrapper #2404

Open · wants to merge 14 commits into base: main
12 changes: 6 additions & 6 deletions src/peft/import_utils.py
@@ -52,7 +52,7 @@ def is_auto_gptq_available():
@lru_cache
def is_gptqmodel_available():
if importlib.util.find_spec("gptqmodel") is not None:
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("1.7.0")
GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("1.9.99")
Review comment (Member):
This is currently problematic, as there is no gptqmodel release that satisfies this version requirement. Therefore, `is_gptqmodel_available` will always raise an error. Since `is_gptqmodel_available` is called by `require_auto_gptq`, that check will also always fail, meaning that our GPU tests cannot run at all.

I think `require_auto_gptq` should be adjusted so that it does not fail if the installed gptqmodel version is too low.
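A minimal sketch of that kind of adjustment, assuming `require_auto_gptq` is a `unittest` skip decorator in the test utilities (the actual helper in PEFT may look different): catching the version `ImportError` lets the tests be skipped instead of erroring out.

import unittest

from peft.import_utils import is_auto_gptq_available, is_gptqmodel_available


def require_auto_gptq(test_case):
    """Skip the test unless a usable auto-gptq or gptqmodel installation is present."""
    try:
        gptqmodel_ok = is_gptqmodel_available()
    except ImportError:
        # gptqmodel (or optimum) is installed but too old -> skip instead of raising
        gptqmodel_ok = False
    return unittest.skipUnless(
        gptqmodel_ok or is_auto_gptq_available(), "test requires auto-gptq or gptqmodel"
    )(test_case)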

OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.23.99")
version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel"))
if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel:
@@ -62,17 +62,17 @@ def is_gptqmodel_available():
return True
else:
raise ImportError(
f"gptqmodel requires optimum version {OPTIMUM_MINIMUM_VERSION} or higher. Found version {version_optimum}, "
f"but only versions above {OPTIMUM_MINIMUM_VERSION} are supported"
f"gptqmodel requires optimum version `{OPTIMUM_MINIMUM_VERSION}` or higher. Found version `{version_optimum}`, "
f"but only versions above `{OPTIMUM_MINIMUM_VERSION}` are supported"
)
else:
raise ImportError(
f"gptqmodel requires optimum version {OPTIMUM_MINIMUM_VERSION} or higher to be installed."
f"gptqmodel requires optimum version `{OPTIMUM_MINIMUM_VERSION}` or higher to be installed."
)
else:
raise ImportError(
f"Found an incompatible version of gptqmodel. Found version {version_gptqmodel}, "
f"but only versions above {GPTQMODEL_MINIMUM_VERSION} are supported"
f"Found an incompatible version of gptqmodel. Found version `{version_gptqmodel}`, "
f"but only versions above `{GPTQMODEL_MINIMUM_VERSION}` are supported"
)


4 changes: 2 additions & 2 deletions src/peft/tuners/lora/__init__.py
@@ -17,7 +17,7 @@

from .config import EvaConfig, LoftQConfig, LoraConfig, LoraRuntimeConfig
from .eva import get_eva_state_dict, initialize_lora_eva_weights
from .gptq import QuantLinear
from .gptq import GPTQLoraLinear
from .layer import Conv2d, Conv3d, Embedding, Linear, LoraLayer
from .model import LoraModel

@@ -27,13 +27,13 @@
"Conv3d",
"Embedding",
"EvaConfig",
"GPTQLoraLinear",
"Linear",
"LoftQConfig",
"LoraConfig",
"LoraLayer",
"LoraModel",
"LoraRuntimeConfig",
"QuantLinear",
"get_eva_state_dict",
"initialize_lora_eva_weights",
]
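For downstream code that imported the old name, a small compatibility shim along these lines can bridge the rename (a sketch, not part of this PR):

try:
    from peft.tuners.lora import GPTQLoraLinear  # name introduced by this PR
except ImportError:
    # older PEFT releases expose the GPTQ LoRA layer under its previous name
    from peft.tuners.lora import QuantLinear as GPTQLoraLinear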
28 changes: 18 additions & 10 deletions src/peft/tuners/lora/gptq.py
@@ -11,18 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional

import torch

from peft.import_utils import is_gptqmodel_available
from peft.tuners.lora.layer import LoraLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils import get_auto_gptq_quant_linear, get_gptqmodel_quant_linear
from peft.utils import get_auto_gptq_quant_linear


class QuantLinear(torch.nn.Module, LoraLayer):
class GPTQLoraLinear(torch.nn.Module, LoraLayer):
def __init__(
self,
base_layer,
@@ -64,9 +63,11 @@ def forward(self, x: torch.Tensor):
if self.disable_adapters:
return result

lora_A_keys = self.lora_A.keys()
for active_adapter in self.active_adapters:
if active_adapter not in self.lora_A.keys():
if active_adapter not in lora_A_keys:
continue

lora_A = self.lora_A[active_adapter]
lora_B = self.lora_B[active_adapter]
dropout = self.lora_dropout[active_adapter]
@@ -78,9 +79,13 @@ def forward(self, x: torch.Tensor):
x = self._cast_input_dtype(x, lora_A.weight.dtype)

output = lora_B(lora_A(dropout(x)))

if requires_conversion:
output = output.to(expected_dtype)
output = output * scaling

if scaling != 1:  # skip the multiplication when scaling == 1, since it is a no-op
output = output * scaling

result += output
return result

@@ -110,13 +115,16 @@ def dispatch_gptq(
cfg = kwargs.get("gptq_quantization_config", None)

if is_gptqmodel_available():
device_map = kwargs.get("device_map", None)
quant_linear = get_gptqmodel_quant_linear(cfg, device_map=device_map)
from gptqmodel.nn_modules.qlinear import BaseQuantLinear

if isinstance(target_base_layer, BaseQuantLinear):
new_module = GPTQLoraLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight
else:
quant_linear = get_auto_gptq_quant_linear(cfg)

if quant_linear is not None and isinstance(target_base_layer, quant_linear):
new_module = QuantLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight
if quant_linear is not None and isinstance(target_base_layer, quant_linear):
new_module = GPTQLoraLinear(target, adapter_name, **kwargs)
target.qweight = target_base_layer.qweight

return new_module
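A rough usage sketch of the dispatch path above (the checkpoint id is a placeholder and the exact attribute path depends on the architecture): layers produced by gptqmodel subclass `BaseQuantLinear`, so attaching a LoRA adapter should route them through `dispatch_gptq` into `GPTQLoraLinear`.

from transformers import AutoModelForCausalLM

from peft import LoraConfig, get_peft_model

# placeholder id: any GPTQ-quantized checkpoint that gptqmodel/transformers can load
base = AutoModelForCausalLM.from_pretrained("some-org/some-model-4bit-gptq", device_map="auto")
peft_model = get_peft_model(base, LoraConfig(r=16, target_modules=["q_proj", "v_proj"]))

# the quantized attention projections should now be wrapped in GPTQLoraLinear
print(type(peft_model.base_model.model.model.layers[0].self_attn.v_proj))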
19 changes: 16 additions & 3 deletions src/peft/tuners/lora/layer.py
@@ -489,7 +489,13 @@ def _mixed_batch_forward(
# getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear
# layer output
sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype)
lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling

# LoRA variants such as EoRA always use scaling == 1, so we can skip the no-op multiplication
if scaling == 1:
lora_output = lora_B(lora_A(dropout(sub_batch)))
else:
lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling

result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype)

return result
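A standalone check of why the fast path is safe (illustration only, not PEFT code): multiplying by a scaling of 1 is an exact no-op for floating-point tensors, so skipping it cannot change the result.

import torch

lora_A = torch.nn.Linear(16, 4, bias=False)
lora_B = torch.nn.Linear(4, 16, bias=False)
x = torch.randn(2, 16)

scaling = 1
fast = lora_B(lora_A(x))            # scaling == 1 path: multiplication skipped
full = lora_B(lora_A(x)) * scaling  # original path
assert torch.equal(fast, full)      # bit-identical outputs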
@@ -711,17 +717,24 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
else:
result = self.base_layer(x, *args, **kwargs)
torch_result_dtype = result.dtype

lora_A_keys = self.lora_A.keys()
for active_adapter in self.active_adapters:
if active_adapter not in self.lora_A.keys():
if active_adapter not in lora_A_keys:
continue

lora_A = self.lora_A[active_adapter]
lora_B = self.lora_B[active_adapter]
dropout = self.lora_dropout[active_adapter]
scaling = self.scaling[active_adapter]
x = self._cast_input_dtype(x, lora_A.weight.dtype)

if not self.use_dora[active_adapter]:
result = result + lora_B(lora_A(dropout(x))) * scaling
# LoRA variants such as EoRA always use scaling == 1, so we can skip the no-op multiplication
if scaling == 1:
result = result + lora_B(lora_A(dropout(x)))
else:
result = result + lora_B(lora_A(dropout(x))) * scaling
else:
if isinstance(dropout, nn.Identity) or not self.training:
base_result = result
6 changes: 1 addition & 5 deletions src/peft/tuners/tuners_utils.py
@@ -14,7 +14,6 @@
from __future__ import annotations

import copy
import logging
import os
import re
import textwrap
@@ -46,9 +45,6 @@
from ._buffer_dict import BufferDict


logger = logging.getLogger(__name__)


@contextmanager
def onload_layer(layer):
r"""
@@ -168,7 +164,7 @@ def __init__(
if not hasattr(self, "peft_config"):
self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config
else:
logger.info(
warnings.warn(
"Already found a `peft_config` attribute in the model. This will lead to having multiple adapters"
" in the model. Make sure to know what you are doing!"
)
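A small standalone illustration of why the switch from `logger.info` to `warnings.warn` matters (not PEFT code): with default settings an info-level log record is swallowed, whereas a warning is printed to stderr once per call site.

import logging
import warnings

logging.getLogger(__name__).info("hidden unless logging is configured at INFO or below")
warnings.warn("shown by default, once per location")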
28 changes: 28 additions & 0 deletions tests/test_gptqmodel.py
@@ -36,6 +36,7 @@
get_peft_model,
prepare_model_for_kbit_training,
)
from peft.tuners.lora import GPTQLoraLinear
from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device

from .testing_utils import (
@@ -347,3 +348,30 @@ def test_non_default_adapter_name(self):
# sanity check
assert n_trainable_default == n_trainable_other
assert n_total_default == n_total_other

@staticmethod
def test_load_lora():
model_id = "ModelCloud/Llama-3.2-1B-gptqmodel-ci-4bit"
Review comment (Member):
Do you have a smaller model that could be used here? That would reduce the risk of a network timeout or a full-disk error on CI. If not, we can see how this one works out.

Reply (Contributor Author):
Anything below 1B has massive quantization errors and makes inference highly unstable, which would make the CI tests wildly unstable too. Let's stick with 1B unless we run into errors.

adapter_id = "ModelCloud/Llama-3.2-1B-gptqmodel-ci-4bit-lora"

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
model.load_adapter(adapter_id)

print("peft model", model)

# assert dynamic rank
v_proj_module = model.model.layers[5].self_attn.v_proj
assert isinstance(v_proj_module, GPTQLoraLinear)
assert v_proj_module.lora_A["default"].weight.data.shape[0] == 128
assert v_proj_module.lora_B["default"].weight.data.shape[1] == 128
gate_proj_module = model.model.layers[5].mlp.gate_proj
assert isinstance(gate_proj_module, GPTQLoraLinear)
assert gate_proj_module.lora_A["default"].weight.data.shape[0] == 256
assert gate_proj_module.lora_B["default"].weight.data.shape[1] == 256

tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer("Capital of France is", return_tensors="pt").to(model.device)
tokens = model.generate(**inp)[0]
result = tokenizer.decode(tokens)

print("result: ", result)