Add Compacter (#297)
Implement compacter adapters.
hSterz authored Mar 23, 2022
1 parent d1a8596 commit 3884182
Showing 18 changed files with 512 additions and 16 deletions.
4 changes: 4 additions & 0 deletions src/transformers/adapters/__init__.py
@@ -40,6 +40,8 @@
"AdapterConfig",
"AdapterConfigBase",
"AdapterFusionConfig",
"CompacterConfig",
"CompacterPlusPlusConfig",
"ConfigUnion",
"DynamicAdapterFusionConfig",
"HoulsbyConfig",
@@ -151,6 +153,8 @@
AdapterConfig,
AdapterConfigBase,
AdapterFusionConfig,
CompacterConfig,
CompacterPlusPlusConfig,
ConfigUnion,
DynamicAdapterFusionConfig,
HoulsbyConfig,
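
The re-exports above make the two new config classes importable from the top-level adapters package. A quick sketch of what that enables (defaults taken from the dataclass definitions added further down in this commit):

    # Sketch: the newly exported Compacter configs and their defaults.
    from transformers.adapters import CompacterConfig, CompacterPlusPlusConfig

    print(CompacterConfig().phm_dim)                   # 4 (inherited from AdapterConfig)
    print(CompacterPlusPlusConfig().reduction_factor)  # 32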
53 changes: 53 additions & 0 deletions src/transformers/adapters/configuration.py
@@ -167,6 +167,31 @@ class AdapterConfig(AdapterConfigBase):
cross_adapter (:obj:`bool`, optional): If True, add adapter modules after the cross attention block of each decoder layer in an encoder-decoder model.
Defaults to False.
leave_out (:obj:`List[int]`, optional): The IDs of the layers (starting at 0) where NO adapter modules should be added.
phm_layer (:obj:`bool`, optional): If True, the down and up projection layers are a PHMLayer.
Defaults to False.
phm_dim (:obj:`int`, optional): The dimension of the phm matrix.
Defaults to 4.
shared_phm_rule (:obj:`bool`, optional): Whether the phm matrix is shared across all layers.
Defaults to True.
factorized_phm_rule (:obj:`bool`, optional): Whether the phm matrix is factorized into a left and right matrix.
Defaults to False.
learn_phm (:obj:`bool`, optional): Whether the phm matrix should be learned during training.
Defaults to True.
factorized_phm_W (:obj:`bool`, optional): Whether the weights matrix is factorized into a left and right matrix.
Defaults to True.
shared_W_phm (:obj:`bool`, optional): Whether the weights matrix is shared across all layers.
Defaults to False.
phm_c_init (:obj:`str`, optional): The initialization function for the weights of the phm matrix.
The possible values are `["normal", "uniform"]`. Defaults to `normal`.
phm_init_range (:obj:`float`, optional): The standard deviation for initializing the phm weights if `phm_c_init="normal"`.
Defaults to 0.0001.
hypercomplex_nonlinearity (:obj:`str`, optional): The distribution from which the weights of the phm layer are drawn.
Defaults to `glorot-uniform`.
phm_rank (:obj:`int`, optional): If the weight matrix is factorized, this specifies the rank of the matrix. E.g. the left matrix
of the down projection has the shape (phm_dim, _in_feats_per_axis, phm_rank) and the right matrix (phm_dim,
phm_rank, _out_feats_per_axis). Defaults to 1.
phm_bias (:obj:`bool`, optional): If True, the down and up projection PHMLayer has a bias term. If `phm_layer` is False, this is ignored.
Defaults to True.
"""

# Required options
@@ -190,6 +215,18 @@ class AdapterConfig(AdapterConfigBase):
inv_adapter_reduction_factor: Optional[int] = None
cross_adapter: bool = False
leave_out: List[int] = field(default_factory=list)
phm_layer: bool = False
phm_dim: int = 4
factorized_phm_W: Optional[bool] = True
shared_W_phm: Optional[bool] = False
shared_phm_rule: Optional[bool] = True
factorized_phm_rule: Optional[bool] = False
phm_c_init: Optional[str] = "normal"
phm_init_range: Optional[float] = 0.0001
learn_phm: Optional[bool] = True
hypercomplex_nonlinearity: Optional[str] = "glorot-uniform"
phm_rank: Optional[int] = 1
phm_bias: Optional[bool] = True

# We want to emulate a simple form of immutability while keeping the ability to add custom attributes.
# Therefore, we don't allow changing attribute values if set once.
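
The PHM options above are ordinary dataclass fields on AdapterConfig, so every derived bottleneck config accepts them as keyword arguments. A minimal sketch with illustrative values (not a recommendation):

    from transformers.adapters import PfeifferConfig

    # Turn a plain bottleneck adapter into a PHM-based one by enabling phm_layer
    # and overriding a few of the new hyperparameters.
    config = PfeifferConfig(phm_layer=True, phm_dim=8, shared_phm_rule=True, phm_rank=1)
    print(config.phm_layer, config.phm_dim)  # True 8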
@@ -224,6 +261,13 @@ class PfeifferConfig(AdapterConfig):
reduction_factor: Union[int, Mapping] = 16


@dataclass(eq=False)
class CompacterPlusPlusConfig(PfeifferConfig):
phm_layer: bool = True
reduction_factor: int = 32
non_linearity: str = "gelu"


@dataclass(eq=False)
class PfeifferInvConfig(PfeifferConfig):
"""
@@ -252,6 +296,13 @@ class HoulsbyConfig(AdapterConfig):
reduction_factor: Union[int, Mapping] = 16


@dataclass(eq=False)
class CompacterConfig(HoulsbyConfig):
phm_layer: bool = True
reduction_factor: int = 32
non_linearity: str = "gelu"


@dataclass(eq=False)
class HoulsbyInvConfig(HoulsbyConfig):
"""
@@ -424,6 +475,8 @@ def adapter(self):
"houlsby": HoulsbyConfig(),
"pfeiffer+inv": PfeifferInvConfig(),
"houlsby+inv": HoulsbyInvConfig(),
"compacter++": CompacterPlusPlusConfig(),
"compacter": CompacterConfig(),
"prefix_tuning": PrefixTuningConfig(),
"prefix_tuning_flat": PrefixTuningConfig(flat=True),
"parallel": ParallelConfig(),
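
With the config classes and the registry keys above, a Compacter adapter can be added either from a config object or from its string name. A minimal usage sketch, assuming the adapter-transformers fork where all model classes carry the adapter mixins (model and adapter names are placeholders):

    from transformers import AutoModel
    from transformers.adapters import CompacterConfig, CompacterPlusPlusConfig

    model = AutoModel.from_pretrained("bert-base-uncased")

    # Either pass a config object ...
    model.add_adapter("demo", config=CompacterPlusPlusConfig())
    # ... or use the registered string keys:
    #   "compacter"   -> CompacterConfig         (Houlsby-style: adapters after attention and FFN)
    #   "compacter++" -> CompacterPlusPlusConfig (Pfeiffer-style: one adapter per layer)
    # model.add_adapter("demo", config="compacter++")

    model.set_active_adapters("demo")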
1 change: 1 addition & 0 deletions src/transformers/adapters/layer.py
@@ -87,6 +87,7 @@ def add_adapter(self, adapter_name: str, layer_idx: int):
else:
adapter_class = Adapter
adapter = adapter_class(
adapter_name=adapter_name,
input_size=self.config.hidden_size,
down_sample=self.config.hidden_size // reduction_factor,
config=adapter_config,
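
The adapter module now receives its adapter_name, which the PHM-based bottleneck layers need in order to look up their (possibly shared) parameters. For intuition only, here is a self-contained sketch of a PHM linear layer of the kind Compacter builds its down/up projections from; this is an illustration under simplified assumptions, not the library's PHMLayer implementation:

    import torch
    import torch.nn as nn

    class TinyPHMLinear(nn.Module):
        # The weight is a sum of Kronecker products of small "rule" matrices
        # (shareable across layers, cf. shared_phm_rule) and per-layer weights
        # (cf. factorized_phm_W / shared_W_phm), so the parameter count scales
        # roughly with in_features * out_features / phm_dim instead of in_features * out_features.
        def __init__(self, in_features, out_features, phm_dim=4, init_range=0.0001):
            super().__init__()
            assert in_features % phm_dim == 0 and out_features % phm_dim == 0
            self.phm_dim = phm_dim
            self.phm_rule = nn.Parameter(torch.randn(phm_dim, phm_dim, phm_dim) * init_range)
            self.W = nn.Parameter(
                torch.randn(phm_dim, in_features // phm_dim, out_features // phm_dim) * init_range
            )
            self.bias = nn.Parameter(torch.zeros(out_features))

        def forward(self, x):
            # Assemble the full weight as a sum of Kronecker products, then apply it.
            weight = sum(torch.kron(self.phm_rule[i], self.W[i]) for i in range(self.phm_dim))
            return x @ weight + self.bias

    down = TinyPHMLinear(in_features=768, out_features=768 // 32, phm_dim=4)
    print(down(torch.randn(2, 768)).shape)  # torch.Size([2, 24])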
26 changes: 26 additions & 0 deletions src/transformers/adapters/model_mixin.py
@@ -128,6 +128,7 @@ def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.model_name = None
self.loaded_embeddings = {}
self.shared_parameters = nn.ModuleDict()
self._active_embedding = "default"

# In some cases, the config is not an instance of a directly supported config class such as BertConfig.
@@ -185,6 +186,11 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra
self.freeze_model(True)
adapter_setup = parse_composition(adapter_setup)
self.apply_to_adapter_layers(lambda i, layer: layer.enable_adapters(adapter_setup, True, False))
for adapter_name in adapter_setup:
if adapter_name in self.shared_parameters:
for param in self.shared_parameters[adapter_name].values():
param.requires_grad = True

if isinstance(self, InvertibleAdaptersMixin):
self.enable_invertible_adapters(adapter_setup.flatten())
# use the adapters to be trained by default in every forward pass
@@ -230,6 +236,9 @@ def active_adapters(self) -> AdapterCompositionBlock:
def active_adapters(self, adapter_setup: Union[list, AdapterCompositionBlock]):
self.set_active_adapters(adapter_setup)

def set_shared_parameters(self, param):
self.shared_parameters = param

def set_active_adapters(
self, adapter_setup: Union[list, AdapterCompositionBlock], skip_layers: Optional[List[int]] = None
):
@@ -274,6 +283,9 @@ def add_adapter(self, adapter_name: str, config=None, overwrite_ok: bool = False
self.config.adapters.add(adapter_name, config=config)
try:
self.apply_to_adapter_layers(lambda i, layer: layer.add_adapter(adapter_name, i))
# PHM Layer
if self.config.adapters.match(adapter_name, AdapterConfig, location_key="phm_layer"):
self._add_shared_parameters(adapter_name, config)
# Prefix Tuning
for module in self.modules():
if isinstance(module, PrefixTuningPool):
@@ -286,6 +298,11 @@ def add_adapter(self, adapter_name: str, config=None, overwrite_ok: bool = False
if set_active:
self.set_active_adapters(adapter_name)

def _add_shared_parameters(self, adapter_name, adapter_config: AdapterConfig):
self.shared_parameters[adapter_name] = (
list(self.get_adapter(adapter_name)[0].values())[0].adapter_down[0].init_shared_parameters()
)

def add_fusion(self, adapter_names: Union[Fuse, list], adapter_fusion_config=None, override_kwargs=None):
warnings.warn(
"add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.",
@@ -604,6 +621,10 @@ def forward_context(self, context: ForwardContext, *args, **kwargs):
return

context.adapters_parallelized = False
# Add the shared parameters for the active adapters to the context
context.shared_parameters = {
name: param for name, param in self.shared_parameters.items() if name in active_adapters.flatten()
}

# Prefix tuning
input_tensor = kwargs.get("input_ids", None)
@@ -791,6 +812,11 @@ def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self._convert_to_flex_head = False

def set_shared_parameters(self, param):
self.shared_parameters = param
if self.base_model is not self:
self.base_model.shared_parameters = self.shared_parameters

def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
"""
Iterates over all layers of the model.
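
Putting the model_mixin changes together: add_adapter() registers the shared PHM parameters for adapters whose config enables phm_layer, train_adapter() unfreezes them alongside the adapter weights, and forward_context() exposes them to the layers of the active setup via the ForwardContext. A minimal end-to-end sketch (adapter name is a placeholder):

    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")
    model.add_adapter("demo", config="compacter++")  # also registers shared PHM parameters for "demo"
    model.train_adapter("demo")                      # freezes the base model, unfreezes adapter + shared params

    # Mirrors what train_adapter() does internally (see the diff above): the shared
    # parameters of the adapter being trained now require gradients.
    for param in model.base_model.shared_parameters["demo"].values():
        assert param.requires_grad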
(Diffs for the remaining changed files were not loaded on this page.)
