
Commit 1a5d0f8

FIX: Don't target the classification head when using target_modules="all-linear" (#2033)
Fixes #2027. When using a transformers sequence classification model, target_modules="all-linear" should not wrap the classification head with LoRA, because the head is already wrapped with ModulesToSave, i.e. it will be fully fine-tuned, which is generally the desired behavior. Before this fix, the classification head would be double-wrapped. With #2028, double-wrapping now raises an error; with this PR, it is avoided entirely. Keeping #2028 is still worthwhile, since it helps catch other situations where double-wrapping might occur due to misconfiguration. Note that there is no fool-proof way to detect the classification head; we have to rely on transformers naming conventions.
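As an illustration of the fixed behavior, here is a minimal sketch adapted from the test added in this commit (it assumes the tiny HuggingFaceH4/tiny-random-LlamaForCausalLM checkpoint used by the test suite is available):

from torch import nn
from transformers import AutoModelForSequenceClassification

from peft import LoraConfig, get_peft_model
from peft.utils import ModulesToSaveWrapper

model = AutoModelForSequenceClassification.from_pretrained(
    "HuggingFaceH4/tiny-random-LlamaForCausalLM", num_labels=10
)
config = LoraConfig(task_type="SEQ_CLS", target_modules="all-linear")
model = get_peft_model(model, config)

# The classification head is fully fine-tuned via ModulesToSave; it is not a LoRA layer.
assert isinstance(model.base_model.score, ModulesToSaveWrapper)
assert isinstance(model.base_model.score.original_module, nn.Linear)

Before this fix, model.base_model.score would additionally have been targeted by LoRA on top of the ModulesToSave wrapper; with #2028, that double-wrapping raises an error instead.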
Parent commit: f3c7c6e

3 files changed (+44 −6)

src/peft/tuners/tuners_utils.py (+18 −4)

@@ -30,8 +30,8 @@
 from transformers.pytorch_utils import Conv1D

 from peft.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND
-from peft.utils.constants import DUMMY_TARGET_MODULES
-from peft.utils.peft_types import PeftType
+from peft.utils.constants import DUMMY_TARGET_MODULES, SEQ_CLS_HEAD_NAMES
+from peft.utils.peft_types import PeftType, TaskType

 from ..config import PeftConfig
 from ..utils import ModulesToSaveWrapper, _get_submodules
@@ -812,11 +812,25 @@ def _maybe_include_all_linear_layers(peft_config: PeftConfig, model: nn.Module)
             names = name.rsplit(".", 1)[-1]  # get the base name
             linear_module_names.add(names)

-    # ignore the last classification head for text generation models
+    # Try to remove linear layers that should not be targeted as best as possible. We have to rely on convention as
+    # there are no hard rules to detect these modules.
+    module_names_to_exclude = set()
     output_emb = model.get_output_embeddings()
     if output_emb is not None:
+        # ignore the last classification head for text generation models
         last_module_name = [name for name, module in model.named_modules() if module is output_emb][0]
-        linear_module_names -= {last_module_name}
+        module_names_to_exclude.add(last_module_name)
+    elif peft_config.task_type == TaskType.SEQ_CLS:
+        # ignore classifier head for classification models (issue 2027)
+        # there is no fix name for the classifier head, so check the common ones
+        for name in SEQ_CLS_HEAD_NAMES:
+            cls_head = getattr(model, name, None)
+            if cls_head is not None:
+                last_module_name = [name for name, module in model.named_modules() if module is cls_head][0]
+                module_names_to_exclude.add(last_module_name)
+                break
+
+    linear_module_names -= module_names_to_exclude
     peft_config.target_modules = linear_module_names
     return peft_config
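To make the hunk above easier to follow: the head module is located by object identity in model.named_modules(), and its qualified name is then excluded from the "all-linear" target set. A standalone sketch of that lookup step follows; the helper name find_head_module_name is illustrative only, not part of PEFT, and the import assumes a PEFT version that includes this commit.

from peft.utils.constants import SEQ_CLS_HEAD_NAMES  # ["score", "classifier"], added in this commit


def find_head_module_name(model):
    # Check the conventional attribute names used by transformers classification heads.
    for attr in SEQ_CLS_HEAD_NAMES:
        cls_head = getattr(model, attr, None)
        if cls_head is not None:
            # Match by object identity to recover the module's fully qualified name.
            return next(name for name, module in model.named_modules() if module is cls_head)
    return None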

src/peft/utils/constants.py (+1)

@@ -257,6 +257,7 @@ def starcoder_model_postprocess_past_key_value(past_key_values):
 SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
 CONFIG_NAME = "adapter_config.json"
 EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"]
+SEQ_CLS_HEAD_NAMES = ["score", "classifier"]
 INCLUDE_LINEAR_LAYERS_SHORTHAND = "all-linear"
 TOKENIZER_CONFIG_NAME = "tokenizer_config.json"
 DUMMY_TARGET_MODULES = "dummy-target-modules"

tests/test_tuners_utils.py (+25 −2)
@@ -23,7 +23,13 @@
 from diffusers import StableDiffusionPipeline
 from parameterized import parameterized
 from torch import nn
-from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoModelForSequenceClassification,
+    BitsAndBytesConfig,
+)

 from peft import (
     AdaptionPromptConfig,
@@ -42,7 +48,7 @@
     check_target_module_exists,
     inspect_matched_modules,
 )
-from peft.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND, infer_device
+from peft.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND, ModulesToSaveWrapper, infer_device

 from .testing_utils import require_bitsandbytes, require_non_cpu, require_torch_gpu
@@ -330,6 +336,23 @@ def test_maybe_include_all_linear_layers_diffusion(self):
         ):
             model.unet = get_peft_model(model.unet, config)

+    def test_maybe_include_all_linear_does_not_target_classifier_head(self):
+        # See issue 2027
+        # Ensure that if a SEQ_CLS model is being used with target_modules="all-linear", the classification head is not
+        # targeted by the adapter layer.
+        model_id = "HuggingFaceH4/tiny-random-LlamaForCausalLM"
+        model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=10)
+        # sanity check
+        assert isinstance(model.score, nn.Linear)
+
+        config = LoraConfig(task_type="SEQ_CLS", target_modules="all-linear")
+        model = get_peft_model(model, config)
+        assert isinstance(model.base_model.score, ModulesToSaveWrapper)
+
+        # the bug was that these were lora.Linear instances
+        assert isinstance(model.base_model.score.original_module, nn.Linear)
+        assert isinstance(model.base_model.score.modules_to_save["default"], nn.Linear)
+

 class MLP(nn.Module):
     def __init__(self, bias=True):
