Update OFT to fix merge bugs #1996

Merged: 43 commits, Oct 1, 2024
Changes from 31 commits
Commits
43 commits
8e48df0
updating oft
Aug 7, 2024
42b72e6
update oft to be consistent with other peft methods
Aug 7, 2024
6ade80f
Merge remote-tracking branch 'upstream/main' into oft-new2
Aug 7, 2024
ada7c69
update oft to be consistent with other peft methods
Aug 7, 2024
b7f5b23
update oft to be consistent with other peft methods
Aug 8, 2024
3d9a4bc
update oft to fix merge bugs, be consistent with other peft methods
Aug 8, 2024
2bca964
update oft to fix merge bugs, be consistent with other peft methods
Aug 8, 2024
76aba70
update oft to fix merge bugs, be consistent with other peft methods
Aug 8, 2024
7b9a1af
Merge remote-tracking branch 'upstream/main' into oft-new2
Aug 8, 2024
7c116b5
addressing issues in config file
Aug 9, 2024
3297ed6
update oft config to be compatible with previous peft 0.12.1.dev0 ver…
Aug 9, 2024
fd3a7a2
update according to review
Aug 13, 2024
27fc08c
run make style
Aug 13, 2024
ddac2b1
Merge remote-tracking branch 'upstream/main' into oft-new2
Aug 20, 2024
b4637a3
added check_kwargs
Aug 26, 2024
e62bcfd
update version check
Aug 27, 2024
fa029c0
update oft config
Sep 17, 2024
d5b2b5a
Merge branch 'main' into oft-new2
Sep 17, 2024
c9d8d58
update oft comments
Sep 17, 2024
690f46f
running make style
Sep 24, 2024
dac8472
manually update for make quality
Sep 25, 2024
d763329
update from __future__ import annotations
Sep 25, 2024
2422d7d
fix import error
Sep 25, 2024
d226b09
update import
Sep 25, 2024
615ee87
Merge remote-tracking branch 'upstream/main' into oft-new2
Sep 25, 2024
74c4b27
update for passing test
Sep 25, 2024
bc149fa
update to fix the low_cpu_mem_usage error
Sep 26, 2024
eb9887f
update to fix the mixed peft errors
Sep 26, 2024
2046baa
update to fix the mixed peft errors
Sep 26, 2024
3838bdb
remove oft from mixed peft
Sep 26, 2024
bd7a4db
remove oft from mixed peft + make quality
Sep 26, 2024
2b105e3
resolve make test errors
Sep 27, 2024
b794269
update to resolve make test errors
Sep 27, 2024
65ce38b
update oft config
Sep 27, 2024
92328c1
modify to resolve issues with make test
Sep 27, 2024
e601692
update to solve test_feature_extraction_models
Sep 27, 2024
b13410c
Merge remote-tracking branch 'upstream/main' into oft-new2
Sep 27, 2024
d6fc326
fetch upstream and make style
Sep 27, 2024
c591015
update test_deeply_nested precision
Sep 30, 2024
097b2f8
skip test_deeply_nested for remote check
Sep 30, 2024
aedaa0d
update oft check_kwargs
Oct 1, 2024
0fd23cc
update oft check_kwargs
Oct 1, 2024
64cd73f
update oft check_kwargs
Oct 1, 2024
10 changes: 10 additions & 0 deletions src/peft/config.py
@@ -149,6 +149,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional

loaded_attributes = cls.from_json_file(config_file)
kwargs = {**class_kwargs, **loaded_attributes}
kwargs = cls.check_kwargs(**kwargs)
return cls.from_peft_type(**kwargs)

@classmethod
@@ -213,6 +214,15 @@ def _get_peft_type(
loaded_attributes = cls.from_json_file(config_file)
return loaded_attributes["peft_type"]

@classmethod
def check_kwargs(cls, **kwargs):
"""Check kwargs before initializing the config instance.

Subclasses can override this method to add specific checks.

"""
return kwargs

@property
def is_prompt_learning(self) -> bool:
r"""
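To make the new hook concrete, here is a minimal, hypothetical sketch of a subclass using it; the class name MyConfig and the key it checks are invented for illustration (the OFTConfig change further down does the same thing for oft_block_size):

from dataclasses import dataclass, field

from peft.config import PeftConfig


@dataclass
class MyConfig(PeftConfig):  # hypothetical subclass, for illustration only
    new_required_field: int = field(default=4)

    @classmethod
    def check_kwargs(cls, **kwargs):
        # from_pretrained now routes the loaded JSON attributes through this hook
        # before instantiating the config, so stale checkpoints can be rejected
        # (or their kwargs rewritten) here.
        if "new_required_field" not in kwargs:
            raise ValueError("This adapter was saved with an older, incompatible config format.")
        return super().check_kwargs(**kwargs)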
2 changes: 0 additions & 2 deletions src/peft/mixed_model.py
@@ -34,7 +34,6 @@
LoKrModel,
LoraModel,
MixedModel,
OFTModel,
)
from .tuners.mixed import COMPATIBLE_TUNER_TYPES
from .utils import PeftType, _set_adapter, _set_trainable
@@ -46,7 +45,6 @@
PeftType.LOKR: LoKrModel,
PeftType.ADALORA: AdaLoraModel,
PeftType.IA3: IA3Model,
PeftType.OFT: OFTModel,
}


18 changes: 13 additions & 5 deletions src/peft/tuners/boft/config.py
@@ -32,7 +32,9 @@ class BOFTConfig(PeftConfig):
boft_block_num (`int`): Number of BOFT blocks per injected layer.
boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers.
target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to.
boft_dropout (`float`): The multiplicative dropout probability for BOFT layers.
boft_dropout (`float`):
The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
dropout layer in LoRA.
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
to `True`.
@@ -81,7 +83,12 @@ class BOFTConfig(PeftConfig):
"example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
},
)
boft_dropout: float = field(default=0.0, metadata={"help": "BOFT multiplicative dropout"})
boft_dropout: float = field(
default=0.0,
metadata={
"help": "BOFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."
},
)
fan_in_fan_out: bool = field(
default=False,
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
@@ -125,9 +132,10 @@ def __post_init__(self):
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
if self.boft_block_size == 0 and self.boft_block_num == 0:
raise ValueError("You must specify either boft_block_size or boft_block_num.")
raise ValueError(
f"Either `boft_block_size` or `boft_block_num` must be non-zero. Currently, boft_block_size = {self.boft_block_size} and boft_block_num = {self.boft_block_num}."
)
if not (self.boft_block_size != 0) ^ (self.boft_block_num != 0):
raise ValueError(
f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), "
"but not both simultaneously, because boft_block_size x boft_block_num != in_features."
f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num == in_features."
)
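As a quick sketch of the constraint enforced in __post_init__ above (the target module names are placeholders, not taken from this PR):

from peft import BOFTConfig

# Valid: exactly one of boft_block_size / boft_block_num is non-zero; the other
# is derived later so that boft_block_size x boft_block_num == in_features.
config = BOFTConfig(boft_block_size=4, boft_block_num=0, target_modules=["q_proj", "v_proj"])

# Invalid: both non-zero raises the ValueError added above.
try:
    BOFTConfig(boft_block_size=4, boft_block_num=8)
except ValueError as err:
    print(err)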
32 changes: 18 additions & 14 deletions src/peft/tuners/boft/layer.py
@@ -324,8 +324,7 @@ def update_layer(

else:
raise ValueError(
f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously or setting both"
"to be 0, because boft_block_size x boft_block_num != in_features."
"Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
)

# In OFT you can specify the number of blocks to be 1
@@ -710,11 +709,6 @@ def update_layer(
conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]

# Initialize the BOFT parameters.
if not (boft_block_size != 0) ^ (boft_block_num != 0):
raise ValueError(
f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num != in_features."
)

if boft_block_size == 0 and boft_block_num != 0:
if conv_filter_dim % boft_block_num != 0:
raise ValueError(
@@ -752,7 +746,9 @@ def update_layer(
boft_block_num = int(conv_filter_dim // boft_block_size)

else:
raise ValueError("Unknown error!")
raise ValueError(
"Something went wrong, please report this error: https://github.com/huggingface/peft/issues"
)

# In OFT you can specify the number of blocks to be 1
if boft_n_butterfly_factor != 0:
@@ -776,7 +772,7 @@ def update_layer(
self.boft_R[adapter_name] = nn.Parameter(
torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size)
)
self.boft_s[adapter_name] = nn.Parameter(torch.ones(1, int(self.out_features)))
self.boft_s[adapter_name] = nn.Parameter(torch.ones(int(self.out_features), 1))

self.reset_boft_parameters(adapter_name, init_weights)

@@ -815,9 +811,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter)

orig_weight = orig_weight.view(
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * boft_s
orig_weight = orig_weight.view(
self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -829,9 +827,11 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N

orig_weight = base_layer.weight.data.clone()
orig_weight = orig_weight.view(
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * boft_s
orig_weight = orig_weight.view(
self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -855,10 +855,12 @@ def unmerge(self) -> None:

orig_weight = self.get_base_layer().weight.data.clone()
orig_weight = orig_weight.view(
self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
self.out_features,
self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat.t(), orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * (1 / boft_s)
orig_weight = orig_weight.view(
self.out_features,
@@ -917,7 +919,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
device=x.device,
dtype=x.dtype,
)
boft_scale = torch.ones((1, int(self.out_features)), device=x.device, dtype=x.dtype)
boft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=x.dtype)

for active_adapter in self.active_adapters:
if active_adapter not in self.boft_R.keys():
@@ -954,10 +956,12 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:

orig_weight = self.base_layer.weight.data
orig_weight = orig_weight.view(
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
orig_weight = torch.transpose(orig_weight, 0, 1)
rotated_weight = torch.mm(boft_rotation, orig_weight)
rotated_weight = torch.transpose(rotated_weight, 0, 1)

scaled_rotated_weight = rotated_weight * boft_scale

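For readers following the reshaped Conv2d merge path above, here is a standalone shape sketch in plain torch (not the PEFT implementation; an identity rotation and unit scale stand in for butterfly_oft_mat and boft_s so the round trip can be checked):

import torch

out_features, in_features, k = 8, 4, 3
weight = torch.randn(out_features, in_features, k, k)  # Conv2d weight layout

flat_dim = in_features * k * k
rotation = torch.eye(flat_dim)       # stands in for butterfly_oft_mat
scale = torch.ones(out_features, 1)  # stands in for boft_s, now shaped (out_features, 1)

w = weight.view(out_features, flat_dim)  # flatten to (out_features, in_features * k * k)
w = torch.mm(rotation, w.t()).t()        # rotate along the flattened input dimension
w = w * scale                            # per-output-channel scale broadcasts over columns
merged = w.view(out_features, in_features, k, k)

# With the identity rotation and unit scale, the weight is unchanged.
assert torch.allclose(merged, weight)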
92 changes: 81 additions & 11 deletions src/peft/tuners/oft/config.py
@@ -12,29 +12,38 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import Literal, Optional, Union

from peft.tuners.lycoris_utils import LycorisConfig
from peft.config import PeftConfig
from peft.utils import PeftType


@dataclass
class OFTConfig(LycorisConfig):
class OFTConfig(PeftConfig):
"""
This is the configuration class to store the configuration of a [`OFTModel`].

Args:
r (`int`): OFT rank.
module_dropout (`int`): The dropout probability for disabling OFT modules during training.
target_modules (`Optional[Union[List[str], str]]`):
r (`int`): OFT rank, number of OFT blocks per injected layer.
oft_block_size (`int`): OFT block size across different layers.
module_dropout (`float`):
The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
dropout layer in LoRA.
target_modules (`Optional[Union[list[str], str]]`):
The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
names will be replaced. When passing a string, a regex match will be performed. When passing a list of
strings, either an exact match will be performed or it is checked if the name of the module ends with any
of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding
the output layer. If this is not specified, modules will be chosen according to the model architecture. If
the architecture is not known, an error will be raised -- in this case, you should specify the target
modules manually.
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
bias (`str`): Bias type for OFT. Can be 'none', 'all' or 'oft_only'. If 'all' or 'oft_only', the
corresponding biases will be updated during training. Be aware that this means that, even when disabling
the adapters, the model will not produce the same output as the base model would have without adaptation.
init_weights (`bool`):
Whether to perform initialization of OFT weights.
layers_to_transform (`Union[List[int], int]`):
@@ -56,18 +65,35 @@ class OFTConfig(LycorisConfig):
Whether to share the OFT parameters between blocks or not. This is `False` by default.
"""

r: int = field(default=8, metadata={"help": "OFT rank"})
r: int = field(default=8, metadata={"help": "OFT rank, number of OFT blocks per injected layer."})
oft_block_size: int = field(
default=0,
metadata={
"help": "OFT block size across different layers.",
"note": "You can only specify either r or oft_block_size, but not both simultaneously, because r x oft_block_size = layer dimension.",
},
)
module_dropout: float = field(
default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"}
default=0.0,
metadata={
"help": "OFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."
},
)
target_modules: Optional[Union[List[str], str]] = field(
target_modules: Optional[Union[list[str], str]] = field(
default=None,
metadata={
"help": "List of module names or regex expression of the module names to replace with OFT."
"For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
"This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
},
)
fan_in_fan_out: bool = field(
default=False,
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
)
bias: Literal["none", "all", "oft_only"] = field(
default="none", metadata={"help": "Bias type for OFT. Can be 'none', 'all' or 'oft_only'"}
)
init_weights: bool = field(
default=True,
metadata={
@@ -77,7 +103,7 @@ class OFTConfig(LycorisConfig):
),
},
)
layers_to_transform: Optional[Union[List[int], int]] = field(
layers_to_transform: Optional[Union[list[int], int]] = field(
default=None,
metadata={
"help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
@@ -89,7 +115,7 @@ class OFTConfig(LycorisConfig):
"help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
},
)
modules_to_save: Optional[List[str]] = field(
modules_to_save: Optional[list[str]] = field(
default=None,
metadata={
"help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. "
@@ -111,9 +137,53 @@ class OFTConfig(LycorisConfig):
default=False,
metadata={"help": "Whether to share the OFT parameters between blocks or not."},
)
rank_pattern: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. "
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}"
"Important: the rank pattern won't be applied to the layers after 0.12.1.dev0!"
)
},
)
alpha_pattern: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. "
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}"
"Important: the alpha pattern won't be applied to the layers after 0.12.1.dev0!"
)
},
)

def __post_init__(self):
self.peft_type = PeftType.OFT
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
if self.r == 0 and self.oft_block_size == 0:
raise ValueError(
f"Either `r` or `oft_block_size` must be non-zero. Currently, r = {self.r} and oft_block_size = {self.oft_block_size}."
)
if not (self.r != 0) ^ (self.oft_block_size != 0):
raise ValueError(
f"You can only specify either r ({self.r}) or oft_block_size ({self.oft_block_size}), but not both simultaneously, because r x oft_block_size == in_features."
)

@classmethod
def check_kwargs(cls, **kwargs):
r"""
Check if the kwargs are valid for the configuration.

Args:
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the child class initialization.
"""
if "oft_block_size" not in kwargs:
raise ValueError(
"OFT has been updated since PEFT 0.13.0. Your trained adapter weights are incompatible with the latest version of OFT. Please retrain your adapter weights with newer PEFT versions."
Review comment (Member): Let's break this long line to be 120 chars at most. Also, there is a space missing at the end of the sentence.

"Alternatively, downgrade PEFT to version 0.12.0 to use the old adapter weights."
)
return super().check_kwargs(**kwargs)
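To close out the config changes, a brief hedged sketch of the new OFT constraints (module names are illustrative only):

from peft import OFTConfig

# Valid: exactly one of r / oft_block_size is non-zero.
config = OFTConfig(r=8, oft_block_size=0, target_modules=["q_proj", "v_proj"])

# Invalid: both non-zero raises the ValueError from __post_init__.
try:
    OFTConfig(r=8, oft_block_size=32)
except ValueError as err:
    print(err)

# Adapters saved before PEFT 0.13.0 have no "oft_block_size" key in their config
# JSON, so OFTConfig.check_kwargs rejects them during from_pretrained and points
# users at retraining or downgrading to PEFT 0.12.0.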