Update OFT to fix merge bugs #1996

Merged
43 commits merged on Oct 1, 2024
Changes from 15 commits
Commits
8e48df0  updating oft (Aug 7, 2024)
42b72e6  update oft to be consistent with other peft methods (Aug 7, 2024)
6ade80f  Merge remote-tracking branch 'upstream/main' into oft-new2 (Aug 7, 2024)
ada7c69  update oft to be consistent with other peft methods (Aug 7, 2024)
b7f5b23  update oft to be consistent with other peft methods (Aug 8, 2024)
3d9a4bc  update oft to fix merge bugs, be consistent with other peft methods (Aug 8, 2024)
2bca964  update oft to fix merge bugs, be consistent with other peft methods (Aug 8, 2024)
76aba70  update oft to fix merge bugs, be consistent with other peft methods (Aug 8, 2024)
7b9a1af  Merge remote-tracking branch 'upstream/main' into oft-new2 (Aug 8, 2024)
7c116b5  addressing issues in config file (Aug 9, 2024)
3297ed6  update oft config to be compatible with previous peft 0.12.1.dev0 ver… (Aug 9, 2024)
fd3a7a2  update according to review (Aug 13, 2024)
27fc08c  run make style (Aug 13, 2024)
ddac2b1  Merge remote-tracking branch 'upstream/main' into oft-new2 (Aug 20, 2024)
b4637a3  added check_kwargs (Aug 26, 2024)
e62bcfd  update version check (Aug 27, 2024)
fa029c0  update oft config (Sep 17, 2024)
d5b2b5a  Merge branch 'main' into oft-new2 (Sep 17, 2024)
c9d8d58  update oft comments (Sep 17, 2024)
690f46f  running make style (Sep 24, 2024)
dac8472  manually update for make quality (Sep 25, 2024)
d763329  update from __future__ import annotations (Sep 25, 2024)
2422d7d  fix import error (Sep 25, 2024)
d226b09  update import (Sep 25, 2024)
615ee87  Merge remote-tracking branch 'upstream/main' into oft-new2 (Sep 25, 2024)
74c4b27  update for passing test (Sep 25, 2024)
bc149fa  update to fix the low_cpu_mem_usage error (Sep 26, 2024)
eb9887f  update to fix the mixed peft errors (Sep 26, 2024)
2046baa  update to fix the mixed peft errors (Sep 26, 2024)
3838bdb  remove oft from mixed peft (Sep 26, 2024)
bd7a4db  remove oft from mixed peft + make quality (Sep 26, 2024)
2b105e3  resolve make test errors (Sep 27, 2024)
b794269  update to resolve make test errors (Sep 27, 2024)
65ce38b  update oft config (Sep 27, 2024)
92328c1  modify to resolve issues with make test (Sep 27, 2024)
e601692  update to solve test_feature_extraction_models (Sep 27, 2024)
b13410c  Merge remote-tracking branch 'upstream/main' into oft-new2 (Sep 27, 2024)
d6fc326  fetch upstream and make style (Sep 27, 2024)
c591015  updat test_deeply_nested precision (Sep 30, 2024)
097b2f8  skip test_deeply_nested for remote check (Sep 30, 2024)
aedaa0d  update oft check_kwargs (Oct 1, 2024)
0fd23cc  update oft check_kwargs (Oct 1, 2024)
64cd73f  update oft check_kwargs (Oct 1, 2024)
16 changes: 16 additions & 0 deletions src/peft/config.py
@@ -149,6 +149,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional

loaded_attributes = cls.from_json_file(config_file)
kwargs = {**class_kwargs, **loaded_attributes}
kwargs = cls.check_kwargs(**kwargs)
return cls.from_peft_type(**kwargs)

@classmethod
@@ -213,6 +214,21 @@ def _get_peft_type(
loaded_attributes = cls.from_json_file(config_file)
return loaded_attributes["peft_type"]

@classmethod
def check_kwargs(cls, **kwargs):
r"""
Check if the kwargs are valid for the configuration.

Args:
kwargs (additional keyword arguments, *optional*):
Additional keyword arguments passed along to the child class initialization.
"""
if "oft_block_size" in kwargs:
Reviewer comment (Member):
Hmm, so oft_block_size is the new parameter, right? So if we're loading an old OFT model, it should be missing. Therefore, should the check not be if "oft_block_size" not in kwargs?

warnings.warn(
'OFT has been updated since 0.12.1.dev0. Your trained adapter weights may not be compatible with the latest version of OFT. Please retrain your adapter weights.'
Reviewer comment (Member):
  1. Note that dev0 is not a real version, so let's not refer to that. The next release version will be 0.13.0, so let's use that version.
  2. "may not be compatible": We are pretty sure it is incompatible when trained, right? Let's phrase it as "is incompatible".
  3. Let's also mention that users can downgrade PEFT to version 0.12.0 and then the adapter will still work.

)
return kwargs

@property
def is_prompt_learning(self) -> bool:
r"""
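For readers following along, here is a minimal sketch of the load path this hunk changes: `from_pretrained` merges the JSON attributes with any class kwargs and now routes them through `check_kwargs` before instantiating the concrete config class. The adapter path below is a placeholder, not something from this PR.

```python
import warnings

from peft import PeftConfig

# Hypothetical adapter directory; any adapter_config.json containing
# "oft_block_size" would hit the new warning added in check_kwargs.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    config = PeftConfig.from_pretrained("path/to/old-oft-adapter")

for w in caught:
    print(w.category.__name__, w.message)
```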
11 changes: 6 additions & 5 deletions src/peft/tuners/boft/config.py
@@ -32,7 +32,7 @@ class BOFTConfig(PeftConfig):
boft_block_num (`int`): Number of BOFT blocks per injected layer.
boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers.
target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to.
boft_dropout (`float`): The multiplicative dropout probability for BOFT layers.
boft_dropout (`float`): The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA.
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
to `True`.
@@ -81,7 +81,7 @@ class BOFTConfig(PeftConfig):
"example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
},
)
boft_dropout: float = field(default=0.0, metadata={"help": "BOFT multiplicative dropout"})
boft_dropout: float = field(default=0.0, metadata={"help": "BOFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."})
fan_in_fan_out: bool = field(
default=False,
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
@@ -125,9 +125,10 @@ def __post_init__(self):
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
if self.boft_block_size == 0 and self.boft_block_num == 0:
raise ValueError("You must specify either boft_block_size or boft_block_num.")
raise ValueError(
f"Either `boft_block_size` or `boft_block_num` must be non-zero. Currently, boft_block_size = {self.boft_block_size} and boft_block_num = {self.boft_block_num}."
)
if not (self.boft_block_size != 0) ^ (self.boft_block_num != 0):
raise ValueError(
f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), "
"but not both simultaneously, because boft_block_size x boft_block_num != in_features."
f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num == in_features."
)
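To illustrate the tightened `__post_init__` checks: exactly one of `boft_block_size` and `boft_block_num` may be non-zero, since their product has to match the layer's `in_features`. A small usage sketch (target modules are placeholders):

```python
from peft import BOFTConfig

# Valid: only boft_block_size is set; boft_block_num is derived per layer
# inside update_layer so that boft_block_size * boft_block_num == in_features.
ok = BOFTConfig(boft_block_size=4, boft_block_num=0, target_modules=["q_proj", "v_proj"])

# Both zero -> the new "must be non-zero" ValueError.
try:
    BOFTConfig(boft_block_size=0, boft_block_num=0, target_modules=["q_proj", "v_proj"])
except ValueError as err:
    print(err)

# Both set -> the "but not both simultaneously" ValueError.
try:
    BOFTConfig(boft_block_size=4, boft_block_num=8, target_modules=["q_proj", "v_proj"])
except ValueError as err:
    print(err)
```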
35 changes: 19 additions & 16 deletions src/peft/tuners/boft/layer.py
@@ -316,10 +316,7 @@ def update_layer(
boft_block_num = int(self.in_features // boft_block_size)

else:
raise ValueError(
f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously or setting both"
"to be 0, because boft_block_size x boft_block_num != in_features."
)
raise ValueError("Something went wrong, please report this error: https://github.com/huggingface/peft/issues")

# In OFT you can specify the number of blocks to be 1
if boft_n_butterfly_factor != 0:
@@ -711,11 +708,6 @@ def update_layer(
conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]

# Initialize the BOFT parameters.
if not (boft_block_size != 0) ^ (boft_block_num != 0):
raise ValueError(
f"You can only specify either boft_block_size ({boft_block_size}) or boft_block_num ({boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num != in_features."
)

if boft_block_size == 0 and boft_block_num != 0:
if conv_filter_dim % boft_block_num != 0:
raise ValueError(
Expand Down Expand Up @@ -753,7 +745,8 @@ def update_layer(
boft_block_num = int(conv_filter_dim // boft_block_size)

else:
raise ValueError("Unknown error!")
raise ValueError("Something went wrong, please report this error: https://github.com/huggingface/peft/issues")


# In OFT you can specify the number of blocks to be 1
if boft_n_butterfly_factor != 0:
@@ -777,7 +770,7 @@
self.boft_R[adapter_name] = nn.Parameter(
torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size)
)
self.boft_s[adapter_name] = nn.Parameter(torch.ones(1, int(self.out_features)))
self.boft_s[adapter_name] = nn.Parameter(torch.ones(int(self.out_features), 1))

self.reset_boft_parameters(adapter_name, init_weights)

@@ -816,9 +809,12 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter)

orig_weight = orig_weight.view(
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
self.out_features,
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * boft_s
orig_weight = orig_weight.view(
self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -830,9 +826,12 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N

orig_weight = base_layer.weight.data.clone()
orig_weight = orig_weight.view(
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], self.out_features
self.out_features,
self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0]
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat, orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * boft_s
orig_weight = orig_weight.view(
self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0]
@@ -856,10 +855,12 @@ def unmerge(self) -> None:

orig_weight = self.get_base_layer().weight.data.clone()
orig_weight = orig_weight.view(
self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
self.out_features,
self.in_features * self.get_base_layer().kernel_size[0] * self.get_base_layer().kernel_size[0],
)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = torch.mm(butterfly_oft_mat.t(), orig_weight)
orig_weight = torch.transpose(orig_weight, 0, 1)
orig_weight = orig_weight * (1 / boft_s)
orig_weight = orig_weight.view(
self.out_features,
@@ -918,7 +919,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
device=x.device,
dtype=x.dtype,
)
boft_scale = torch.ones((1, int(self.out_features)), device=x.device, dtype=x.dtype)
boft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=x.dtype)

for active_adapter in self.active_adapters:
if active_adapter not in self.boft_R.keys():
@@ -955,10 +956,12 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:

orig_weight = self.base_layer.weight.data
orig_weight = orig_weight.view(
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
self.out_features,
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0],
)
orig_weight = torch.transpose(orig_weight, 0, 1)
rotated_weight = torch.mm(boft_rotation, orig_weight)
rotated_weight = torch.transpose(rotated_weight, 0, 1)

scaled_rotated_weight = rotated_weight * boft_scale

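The Conv2d merge/unmerge fix above boils down to flattening the kernel to `(out_features, in_features * k * k)`, transposing so the orthogonal butterfly matrix rotates the flattened input dimension, transposing back, and scaling with a `(out_features, 1)` `boft_s` that broadcasts per output channel. The earlier code viewed the `(out, in, k, k)` weight directly as `(in*k*k, out)`, which reinterprets the memory layout rather than transposing it. A standalone sketch of the corrected pattern (illustrative shapes, not the PEFT implementation itself):

```python
import torch

# Sketch of the reshape / rotate / scale pattern used in the fixed Conv2d merge path.
out_features, in_features, k = 8, 4, 3
rot_dim = in_features * k * k

weight = torch.randn(out_features, in_features, k, k)                  # Conv2d weight layout
butterfly_oft_mat = torch.linalg.qr(torch.randn(rot_dim, rot_dim))[0]  # orthogonal rotation
boft_s = torch.rand(out_features, 1) + 0.5                             # per-output-channel scale

# Merge: rotate the flattened input dimension, then scale each output channel.
w = weight.view(out_features, rot_dim)
w = torch.transpose(w, 0, 1)                  # (in*k*k, out)
w = torch.mm(butterfly_oft_mat, w)            # rotate input dimension
w = torch.transpose(w, 0, 1)                  # back to (out, in*k*k)
w = w * boft_s                                # (out, 1) broadcasts over columns
merged = w.view(out_features, in_features, k, k)

# Unmerge: the transpose of an orthogonal matrix undoes the rotation, 1/boft_s the scale.
u = merged.view(out_features, rot_dim)
u = torch.transpose(u, 0, 1)
u = torch.mm(butterfly_oft_mat.t(), u)
u = torch.transpose(u, 0, 1)
u = u * (1.0 / boft_s)
restored = u.view(out_features, in_features, k, k)

assert torch.allclose(restored, weight, atol=1e-5)
```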
57 changes: 51 additions & 6 deletions src/peft/tuners/oft/config.py
@@ -15,18 +15,19 @@
from dataclasses import dataclass, field
from typing import List, Optional, Union

from peft.tuners.lycoris_utils import LycorisConfig
from peft.config import PeftConfig
from peft.utils import PeftType


@dataclass
class OFTConfig(LycorisConfig):
class OFTConfig(PeftConfig):
"""
This is the configuration class to store the configuration of a [`OFTModel`].

Args:
r (`int`): OFT rank.
module_dropout (`int`): The dropout probability for disabling OFT modules during training.
r (`int`): OFT rank, number of OFT blocks per injected layer.
oft_block_size (`int`): OFT block size across different layers.
module_dropout (`float`): The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA.
target_modules (`Optional[Union[List[str], str]]`):
The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
names will be replaced. When passing a string, a regex match will be performed. When passing a list of
@@ -35,6 +36,10 @@ class OFTConfig(LycorisConfig):
the output layer. If this is not specified, modules will be chosen according to the model architecture. If
the architecture is not known, an error will be raised -- in this case, you should specify the target
modules manually.
fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
bias (`str`): Bias type for OFT. Can be 'none', 'all' or 'oft_only'. If 'all' or 'oft_only', the
corresponding biases will be updated during training. Be aware that this means that, even when disabling
the adapters, the model will not produce the same output as the base model would have without adaptation.
init_weights (`bool`):
Whether to perform initialization of OFT weights.
layers_to_transform (`Union[List[int], int]`):
@@ -56,9 +61,16 @@ class OFTConfig(LycorisConfig):
Whether to share the OFT parameters between blocks or not. This is `False` by default.
"""

r: int = field(default=8, metadata={"help": "OFT rank"})
r: int = field(default=8, metadata={"help": "OFT rank, number of OFT blocks per injected layer."})
oft_block_size: int = field(
default=0,
metadata={
"help": "OFT block size across different layers.",
"note": "You can only specify either r or oft_block_size, but not both simultaneously, because r x oft_block_size = layer dimension.",
},
)
module_dropout: float = field(
default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"}
default=0.0, metadata={"help": "OFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."}
)
target_modules: Optional[Union[List[str], str]] = field(
default=None,
Expand All @@ -68,6 +80,11 @@ class OFTConfig(LycorisConfig):
"This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
},
)
fan_in_fan_out: bool = field(
default=False,
metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
)
bias: str = field(default="none", metadata={"help": "Bias type for OFT. Can be 'none', 'all' or 'oft_only'"})
init_weights: bool = field(
default=True,
metadata={
@@ -111,9 +128,37 @@ class OFTConfig(LycorisConfig):
default=False,
metadata={"help": "Whether to share the OFT parameters between blocks or not."},
)
rank_pattern: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. "
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}"
"Important: the rank pattern won't be applied to the layers after 0.12.1.dev0!"
)
},
)
alpha_pattern: Optional[dict] = field(
default_factory=dict,
metadata={
"help": (
"The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. "
"For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}"
"Important: the alpha pattern won't be applied to the layers after 0.12.1.dev0!"
)
},
)

def __post_init__(self):
self.peft_type = PeftType.OFT
self.target_modules = (
set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
)
if self.r == 0 and self.oft_block_size == 0:
raise ValueError(
f"Either `r` or `oft_block_size` must be non-zero. Currently, r = {self.r} and oft_block_size = {self.oft_block_size}."
)
if not (self.r != 0) ^ (self.oft_block_size != 0):
raise ValueError(
f"You can only specify either r ({self.r}) or oft_block_size ({self.oft_block_size}), but not both simultaneously, because r x oft_block_size == in_features."
)
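As with BOFT, the new `__post_init__` makes `r` and `oft_block_size` mutually exclusive, since their product must equal the dimension of each injected layer. A quick sketch under those assumptions (target modules are placeholders):

```python
from peft import OFTConfig

# Specify the adapter either by number of blocks (r) or by block size, not both.
cfg_by_rank = OFTConfig(r=8, oft_block_size=0, target_modules=["q_proj", "v_proj"])
cfg_by_block = OFTConfig(r=0, oft_block_size=32, target_modules=["q_proj", "v_proj"])

try:
    OFTConfig(r=8, oft_block_size=32)  # both non-zero -> ValueError from __post_init__
except ValueError as err:
    print(err)
```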