From d2d69900c51bf18e3c3839a995ce6229637216cb Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 3 Jan 2024 21:27:11 +0530 Subject: [PATCH 1/8] improve docs and add small utils --- docs/source/developer_guides/lora.md | 13 ++++++++++ src/peft/__init__.py | 2 ++ src/peft/utils/__init__.py | 2 ++ src/peft/utils/other.py | 36 ++++++++++++++++++++++++++++ src/peft/utils/peft_types.py | 29 ++++++++++++++++++++-- 5 files changed, 80 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 6fd5d62b07..cd6e2673e9 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -136,3 +136,16 @@ model.unload() # delete adapter model.delete_adapter("dpo") ``` + +## Utilities + +Q-LoRA paper outlines that best performance is achieved by targetting all the Linear layers of the model instead of only targetting the query and value layers. To find all the linear layers to target, we provide a utility function `get_linear_layer_names`. Below is the sample usage of it: + +```py +from peft import get_linear_layer_names +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +print(get_linear_layer_names(model)) +# ['v_proj', 'o_proj', 'k_proj', 'down_proj', 'up_proj', 'q_proj', 'gate_proj'] +``` \ No newline at end of file diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 2b1eee8163..d255271e66 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -83,5 +83,7 @@ set_peft_model_state_dict, shift_tokens_right, load_peft_weights, + get_linear_layer_names, + cast_non_trainable_to_dtype, ) from .config import PeftConfig, PromptLearningConfig diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index 7cb25e75aa..c4bb40394f 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -46,5 +46,7 @@ get_auto_gptq_quant_linear, get_quantization_config, id_tensor_storage, + get_linear_layer_names, + cast_non_trainable_to_dtype, ) from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 1a2d8297af..c0f9fd262e 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -496,3 +496,39 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: unique_id = storage_ptr(tensor) return tensor.device, unique_id, storage_size(tensor) + + +# adapted from https://github.com/Bavest/fin-llama/blob/8ba68f4b49753358ffbe693d38f2461c031625e3/qlora.py#L229 +def get_linear_layer_names(model): + """ + Returns a list of linear layers in the model. + + Args: + model (`torch.nn.Module`): + The model for which to get the layer names. + """ + lora_module_names = set() + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + names = name.split(".") + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if "lm_head" in lora_module_names: # needed for 16-bit + lora_module_names.remove("lm_head") + return list(lora_module_names) + + +def cast_non_trainable_to_dtype(model, dtype): + """ + Cast all non-trainable parameters of the model to the given `dtype`. + This is meant to reduce the GPU memory usage when using PEFT methods. + + Args: + model (`torch.nn.Module`): + The model to cast the non-trainable parameters of. 
+ dtype (`torch.dtype`): + The dtype to cast the non-trainable parameters to. + """ + for p in model.parameters(): + if not p.requires_grad: + p.data = p.to(dtype) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index aaec190855..21fdb2f6d0 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -20,7 +20,22 @@ class PeftType(str, enum.Enum): - """Enum class for the different types of adapters in PEFT.""" + """ + Enum class for the different types of adapters in PEFT. + + Supported PEFT types: + - PROMPT_TUNING + - MULTITASK_PROMPT_TUNING + - P_TUNING + - PREFIX_TUNING + - LORA + - ADALORA + - ADAPTION_PROMPT + - IA3 + - LOHA + - LOKR + - OFT + """ PROMPT_TUNING = "PROMPT_TUNING" MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" @@ -36,7 +51,17 @@ class PeftType(str, enum.Enum): class TaskType(str, enum.Enum): - """Enum class for the different types of tasks supported by PEFT.""" + """ + Enum class for the different types of tasks supported by PEFT. + + Overview of the supported task types: + - SEQ_CLS: Text classification. + - SEQ_2_SEQ_LM: Sequence-to-sequence language modeling. + - Causal LM: Causal language modeling. + - TOKEN_CLS: Token classification. + - QUESTION_ANS: Question answering. + - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as exmbeddings or features for downstream tasks. + """ SEQ_CLS = "SEQ_CLS" SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM" From 4ed491f5d527980415aa48585389d1a6e9bf751e Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 3 Jan 2024 21:32:20 +0530 Subject: [PATCH 2/8] quality --- src/peft/utils/other.py | 4 ++-- src/peft/utils/peft_types.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index c0f9fd262e..f9a8bed821 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -520,8 +520,8 @@ def get_linear_layer_names(model): def cast_non_trainable_to_dtype(model, dtype): """ - Cast all non-trainable parameters of the model to the given `dtype`. - This is meant to reduce the GPU memory usage when using PEFT methods. + Cast all non-trainable parameters of the model to the given `dtype`. This is meant to reduce the GPU memory usage + when using PEFT methods by using half-precision dtype for non-trainable parameters. Args: model (`torch.nn.Module`): diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 21fdb2f6d0..cc2af8b60f 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -60,7 +60,8 @@ class TaskType(str, enum.Enum): - Causal LM: Causal language modeling. - TOKEN_CLS: Token classification. - QUESTION_ANS: Question answering. - - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as exmbeddings or features for downstream tasks. + - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as exmbeddings or features + for downstream tasks. 
""" SEQ_CLS = "SEQ_CLS" From 65dd0a4add79ae8e91c837b375ea6c1657327786 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 3 Jan 2024 21:33:12 +0530 Subject: [PATCH 3/8] fix typo --- src/peft/utils/peft_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index cc2af8b60f..f8c5476ecc 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -60,7 +60,7 @@ class TaskType(str, enum.Enum): - Causal LM: Causal language modeling. - TOKEN_CLS: Token classification. - QUESTION_ANS: Question answering. - - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as exmbeddings or features + - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as embeddings or features for downstream tasks. """ From aa1d75c55c2d63666b37fb3790c92cfaf446b6f7 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 10 Jan 2024 13:25:46 +0530 Subject: [PATCH 4/8] updates --- docs/source/developer_guides/lora.md | 13 ------------- src/peft/__init__.py | 1 - src/peft/utils/__init__.py | 1 - src/peft/utils/other.py | 27 +++++---------------------- 4 files changed, 5 insertions(+), 37 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index cd6e2673e9..0276123464 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -135,17 +135,4 @@ model.unload() # delete adapter model.delete_adapter("dpo") -``` - -## Utilities - -Q-LoRA paper outlines that best performance is achieved by targetting all the Linear layers of the model instead of only targetting the query and value layers. To find all the linear layers to target, we provide a utility function `get_linear_layer_names`. Below is the sample usage of it: - -```py -from peft import get_linear_layer_names -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") -print(get_linear_layer_names(model)) -# ['v_proj', 'o_proj', 'k_proj', 'down_proj', 'up_proj', 'q_proj', 'gate_proj'] ``` \ No newline at end of file diff --git a/src/peft/__init__.py b/src/peft/__init__.py index d255271e66..0ee2ca9a5d 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -83,7 +83,6 @@ set_peft_model_state_dict, shift_tokens_right, load_peft_weights, - get_linear_layer_names, cast_non_trainable_to_dtype, ) from .config import PeftConfig, PromptLearningConfig diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index c4bb40394f..255a24e8c5 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -46,7 +46,6 @@ get_auto_gptq_quant_linear, get_quantization_config, id_tensor_storage, - get_linear_layer_names, cast_non_trainable_to_dtype, ) from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index f9a8bed821..187fa2cefc 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -498,30 +498,11 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: return tensor.device, unique_id, storage_size(tensor) -# adapted from https://github.com/Bavest/fin-llama/blob/8ba68f4b49753358ffbe693d38f2461c031625e3/qlora.py#L229 -def get_linear_layer_names(model): - """ - Returns a list of linear layers in the model. 
- - Args: - model (`torch.nn.Module`): - The model for which to get the layer names. - """ - lora_module_names = set() - for name, module in model.named_modules(): - if isinstance(module, torch.nn.Linear): - names = name.split(".") - lora_module_names.add(names[0] if len(names) == 1 else names[-1]) - - if "lm_head" in lora_module_names: # needed for 16-bit - lora_module_names.remove("lm_head") - return list(lora_module_names) - - def cast_non_trainable_to_dtype(model, dtype): """ - Cast all non-trainable parameters of the model to the given `dtype`. This is meant to reduce the GPU memory usage - when using PEFT methods by using half-precision dtype for non-trainable parameters. + Cast all non-trainable parameters of the model to the given `dtype`. The trainable parameters are casted to full precision. + This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for non-trainable parameters. + Having the trainable parameters in full-precision preserves training stability when using automatic mixed precision training. Args: model (`torch.nn.Module`): @@ -532,3 +513,5 @@ def cast_non_trainable_to_dtype(model, dtype): for p in model.parameters(): if not p.requires_grad: p.data = p.to(dtype) + else: + p.data = p.to(torch.float32) From 827b7c33fdb88bb56ba0da35b4246309ac8c0072 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 10 Jan 2024 13:28:44 +0530 Subject: [PATCH 5/8] quality --- src/peft/utils/other.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index b0bd563ba8..37a91720af 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -502,9 +502,10 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: def cast_non_trainable_to_dtype(model, dtype): """ - Cast all non-trainable parameters of the model to the given `dtype`. The trainable parameters are casted to full precision. - This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for non-trainable parameters. - Having the trainable parameters in full-precision preserves training stability when using automatic mixed precision training. + Cast all non-trainable parameters of the model to the given `dtype`. The trainable parameters are casted to full + precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for + non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using + automatic mixed precision training. Args: model (`torch.nn.Module`): From 90868de56d72d6f9f135992ebe6c833ee564bd69 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:26:02 +0530 Subject: [PATCH 6/8] Update src/peft/utils/other.py Co-authored-by: Benjamin Bossan --- src/peft/utils/other.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 37a91720af..ab9323eea9 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -502,7 +502,7 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: def cast_non_trainable_to_dtype(model, dtype): """ - Cast all non-trainable parameters of the model to the given `dtype`. The trainable parameters are casted to full + Cast all non-trainable parameters of the model to the given `dtype`. 
The trainable parameters are cast to full precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using automatic mixed precision training. From fd0a77b1ffefa77523f7fb4181a3661a04c931bd Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:32:37 +0530 Subject: [PATCH 7/8] address comments --- docs/source/developer_guides/troubleshooting.md | 13 +++++++++++++ src/peft/__init__.py | 2 +- src/peft/utils/__init__.py | 2 +- src/peft/utils/other.py | 14 ++++++++------ 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/docs/source/developer_guides/troubleshooting.md b/docs/source/developer_guides/troubleshooting.md index 2156af1313..119c4ce00b 100644 --- a/docs/source/developer_guides/troubleshooting.md +++ b/docs/source/developer_guides/troubleshooting.md @@ -58,6 +58,19 @@ trainer = Trainer(model=peft_model, fp16=True, ...) trainer.train() ``` +Alternatively, you can use the utility function `cast_mixed_precision_params` from peft as shown below: +```python +from peft import cast_mixed_precision_params + +peft_model = get_peft_model(...) +cast_mixed_precision_params(peft_model, dtype=torch.float16) + +# proceed as usual +trainer = Trainer(model=peft_model, fp16=True, ...) +trainer.train() +``` + + ## Bad results from a loaded PEFT model There can be several reasons for getting a poor result from a loaded PEFT model, which are listed below. If you're still unable to troubleshoot the problem, see if anyone else had a similar [issue](https://github.com/huggingface/peft/issues) on GitHub, and if you can't find any, open a new issue. diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 0ee2ca9a5d..1d35ed9d23 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -83,6 +83,6 @@ set_peft_model_state_dict, shift_tokens_right, load_peft_weights, - cast_non_trainable_to_dtype, + cast_mixed_precision_params, ) from .config import PeftConfig, PromptLearningConfig diff --git a/src/peft/utils/__init__.py b/src/peft/utils/__init__.py index 28e4e925b7..9ab4bd4d7a 100644 --- a/src/peft/utils/__init__.py +++ b/src/peft/utils/__init__.py @@ -47,6 +47,6 @@ get_auto_gptq_quant_linear, get_quantization_config, id_tensor_storage, - cast_non_trainable_to_dtype, + cast_mixed_precision_params, ) from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 37a91720af..88778f2d6b 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -500,18 +500,20 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: return tensor.device, unique_id, storage_size(tensor) -def cast_non_trainable_to_dtype(model, dtype): +def cast_mixed_precision_params(model, dtype): """ - Cast all non-trainable parameters of the model to the given `dtype`. The trainable parameters are casted to full - precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for - non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using - automatic mixed precision training. + Cast all non-trainable parameters of the model to the given `dtype`. The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. 
The trainable parameters are casted to + full precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype + for non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when + using automatic mixed-precision training. Args: model (`torch.nn.Module`): The model to cast the non-trainable parameters of. dtype (`torch.dtype`): - The dtype to cast the non-trainable parameters to. + The dtype to cast the non-trainable parameters to. The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. """ for p in model.parameters(): if not p.requires_grad: From d899be4c18f0694ae564daf09ba723da4c7f6480 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:37:17 +0530 Subject: [PATCH 8/8] quality --- src/peft/utils/other.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/peft/utils/other.py b/src/peft/utils/other.py index 46bb75593c..57c6c115a5 100644 --- a/src/peft/utils/other.py +++ b/src/peft/utils/other.py @@ -503,10 +503,10 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: def cast_mixed_precision_params(model, dtype): """ Cast all non-trainable parameters of the model to the given `dtype`. The `dtype` can be `torch.float16` or - `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to - full precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype - for non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when - using automatic mixed-precision training. + `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to full + precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for + non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using + automatic mixed-precision training. Args: model (`torch.nn.Module`):
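

With the full series applied, `cast_mixed_precision_params` casts the frozen parameters to the given half-precision `dtype` while forcing the trainable parameters back to `torch.float32`, complementing the `Trainer` snippet added to `troubleshooting.md` above. Below is a minimal sketch of the resulting behavior; the base model checkpoint, LoRA rank, and `target_modules` are illustrative assumptions, not part of this patch series:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, cast_mixed_precision_params

# Illustrative setup: any causal LM with LoRA adapters behaves the same way.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
peft_model = get_peft_model(base, LoraConfig(r=8, target_modules=["q_proj", "v_proj"]))

# Frozen base weights -> float16 (reduces GPU memory usage); trainable LoRA
# weights -> float32 (preserves stability under automatic mixed-precision training).
cast_mixed_precision_params(peft_model, dtype=torch.float16)

# Sanity check: only the non-trainable parameters end up in half precision.
for p in peft_model.parameters():
    assert p.dtype == (torch.float32 if p.requires_grad else torch.float16)
```

The final assertion mirrors the utility's loop over `model.parameters()`: anything with `requires_grad=False` is cast to `dtype`, everything else to full precision.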