Add Compacter (#297)
Implement compacter adapters.
hSterz authored Mar 23, 2022
1 parent d1a8596 commit 3884182
Showing 18 changed files with 512 additions and 16 deletions.
4 changes: 4 additions & 0 deletions src/transformers/adapters/__init__.py
@@ -40,6 +40,8 @@
"AdapterConfig",
"AdapterConfigBase",
"AdapterFusionConfig",
"CompacterConfig",
"CompacterPlusPlusConfig",
"ConfigUnion",
"DynamicAdapterFusionConfig",
"HoulsbyConfig",
@@ -151,6 +153,8 @@
AdapterConfig,
AdapterConfigBase,
AdapterFusionConfig,
CompacterConfig,
CompacterPlusPlusConfig,
ConfigUnion,
DynamicAdapterFusionConfig,
HoulsbyConfig,
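
The re-exports above make the two new config classes importable from the top-level adapters package. A quick sketch of what that enables (defaults taken from the dataclass definitions added further down in this commit):

    # Sketch: the newly exported Compacter configs and their defaults.
    from transformers.adapters import CompacterConfig, CompacterPlusPlusConfig

    print(CompacterConfig().phm_dim)                   # 4 (inherited from AdapterConfig)
    print(CompacterPlusPlusConfig().reduction_factor)  # 32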
53 changes: 53 additions & 0 deletions src/transformers/adapters/configuration.py
@@ -167,6 +167,31 @@ class AdapterConfig(AdapterConfigBase):
cross_adapter (:obj:`bool`, optional): If True, add adapter modules after the cross attention block of each decoder layer in an encoder-decoder model.
Defaults to False.
leave_out (:obj:`List[int]`, optional): The IDs of the layers (starting at 0) where NO adapter modules should be added.
phm_layer (:obj:`bool`, optional): If True, the down and up projection layers are a PHMLayer.
Defaults to False.
phm_dim (:obj:`int`, optional): The dimension of the phm matrix.
Defaults to 4.
shared_phm_rule (:obj:`bool`, optional): Whether the phm matrix is shared across all layers.
Defaults to True.
factorized_phm_rule (:obj:`bool`, optional): Whether the phm matrix is factorized into a left and right matrix.
Defaults to False.
learn_phm (:obj:`bool`, optional): Whether the phm matrix should be learned during training.
Defaults to True.
factorized_phm_W (:obj:`bool`, optional): Whether the weights matrix is factorized into a left and right matrix.
Defaults to True.
shared_W_phm (:obj:`bool`, optional): Whether the weights matrix is shared across all layers.
Defaults to False.
phm_c_init (:obj:`str`, optional): The initialization function for the weights of the phm matrix.
The possible values are `["normal", "uniform"]`. Defaults to `normal`.
phm_init_range (:obj:`float`, optional): The standard deviation for initializing the phm weights if `phm_c_init="normal"`.
Defaults to 0.0001.
hypercomplex_nonlinearity (:obj:`str`, optional): The distribution from which the weights of the phm layer are drawn.
Defaults to `glorot-uniform`.
phm_rank (:obj:`int`, optional): If the weight matrix is factorized, this specifies the rank of the matrix. E.g. the left matrix
of the down projection has the shape (phm_dim, _in_feats_per_axis, phm_rank) and the right matrix (phm_dim,
phm_rank, _out_feats_per_axis). Defaults to 1.
phm_bias (:obj:`bool`, optional): If True, the down and up projection PHMLayer has a bias term. If `phm_layer` is False, this is ignored.
Defaults to True.
"""

# Required options
@@ -190,6 +215,18 @@ class AdapterConfig(AdapterConfigBase):
inv_adapter_reduction_factor: Optional[int] = None
cross_adapter: bool = False
leave_out: List[int] = field(default_factory=list)
phm_layer: bool = False
phm_dim: int = 4
factorized_phm_W: Optional[bool] = True
shared_W_phm: Optional[bool] = False
shared_phm_rule: Optional[bool] = True
factorized_phm_rule: Optional[bool] = False
phm_c_init: Optional[str] = "normal"
phm_init_range: Optional[float] = 0.0001
learn_phm: Optional[bool] = True
hypercomplex_nonlinearity: Optional[str] = "glorot-uniform"
phm_rank: Optional[int] = 1
phm_bias: Optional[bool] = True

# We want to emulate a simple form of immutability while keeping the ability to add custom attributes.
# Therefore, we don't allow changing attribute values if set once.
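
The PHM options above are ordinary dataclass fields on AdapterConfig, so every derived bottleneck config accepts them as keyword arguments. A minimal sketch with illustrative values (not a recommendation):

    from transformers.adapters import PfeifferConfig

    # Turn a plain bottleneck adapter into a PHM-based one by enabling phm_layer
    # and overriding a few of the new hyperparameters.
    config = PfeifferConfig(phm_layer=True, phm_dim=8, shared_phm_rule=True, phm_rank=1)
    print(config.phm_layer, config.phm_dim)  # True 8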
@@ -224,6 +261,13 @@ class PfeifferConfig(AdapterConfig):
reduction_factor: Union[int, Mapping] = 16


@dataclass(eq=False)
class CompacterPlusPlusConfig(PfeifferConfig):
phm_layer: bool = True
reduction_factor: int = 32
non_linearity: str = "gelu"


@dataclass(eq=False)
class PfeifferInvConfig(PfeifferConfig):
"""
@@ -252,6 +296,13 @@ class HoulsbyConfig(AdapterConfig):
reduction_factor: Union[int, Mapping] = 16


@dataclass(eq=False)
class CompacterConfig(HoulsbyConfig):
phm_layer: bool = True
reduction_factor: int = 32
non_linearity: str = "gelu"


@dataclass(eq=False)
class HoulsbyInvConfig(HoulsbyConfig):
"""
@@ -424,6 +475,8 @@ def adapter(self):
"houlsby": HoulsbyConfig(),
"pfeiffer+inv": PfeifferInvConfig(),
"houlsby+inv": HoulsbyInvConfig(),
"compacter++": CompacterPlusPlusConfig(),
"compacter": CompacterConfig(),
"prefix_tuning": PrefixTuningConfig(),
"prefix_tuning_flat": PrefixTuningConfig(flat=True),
"parallel": ParallelConfig(),
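
With the config classes and the registry keys above, a Compacter adapter can be added either from a config object or from its string name. A minimal usage sketch, assuming the adapter-transformers fork where all model classes carry the adapter mixins (model and adapter names are placeholders):

    from transformers import AutoModel
    from transformers.adapters import CompacterConfig, CompacterPlusPlusConfig

    model = AutoModel.from_pretrained("bert-base-uncased")

    # Either pass a config object ...
    model.add_adapter("demo", config=CompacterPlusPlusConfig())
    # ... or use the registered string keys:
    #   "compacter"   -> CompacterConfig         (Houlsby-style: adapters after attention and FFN)
    #   "compacter++" -> CompacterPlusPlusConfig (Pfeiffer-style: one adapter per layer)
    # model.add_adapter("demo", config="compacter++")

    model.set_active_adapters("demo")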
1 change: 1 addition & 0 deletions src/transformers/adapters/layer.py
@@ -87,6 +87,7 @@ def add_adapter(self, adapter_name: str, layer_idx: int):
else:
adapter_class = Adapter
adapter = adapter_class(
adapter_name=adapter_name,
input_size=self.config.hidden_size,
down_sample=self.config.hidden_size // reduction_factor,
config=adapter_config,
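
The adapter module now receives its adapter_name, which the PHM-based bottleneck layers need in order to look up their (possibly shared) parameters. For intuition only, here is a self-contained sketch of a PHM linear layer of the kind Compacter builds its down/up projections from; this is an illustration under simplified assumptions, not the library's PHMLayer implementation:

    import torch
    import torch.nn as nn

    class TinyPHMLinear(nn.Module):
        # The weight is a sum of Kronecker products of small "rule" matrices
        # (shareable across layers, cf. shared_phm_rule) and per-layer weights
        # (cf. factorized_phm_W / shared_W_phm), so the parameter count scales
        # roughly with in_features * out_features / phm_dim instead of in_features * out_features.
        def __init__(self, in_features, out_features, phm_dim=4, init_range=0.0001):
            super().__init__()
            assert in_features % phm_dim == 0 and out_features % phm_dim == 0
            self.phm_dim = phm_dim
            self.phm_rule = nn.Parameter(torch.randn(phm_dim, phm_dim, phm_dim) * init_range)
            self.W = nn.Parameter(
                torch.randn(phm_dim, in_features // phm_dim, out_features // phm_dim) * init_range
            )
            self.bias = nn.Parameter(torch.zeros(out_features))

        def forward(self, x):
            # Assemble the full weight as a sum of Kronecker products, then apply it.
            weight = sum(torch.kron(self.phm_rule[i], self.W[i]) for i in range(self.phm_dim))
            return x @ weight + self.bias

    down = TinyPHMLinear(in_features=768, out_features=768 // 32, phm_dim=4)
    print(down(torch.randn(2, 768)).shape)  # torch.Size([2, 24])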
26 changes: 26 additions & 0 deletions src/transformers/adapters/model_mixin.py
@@ -128,6 +128,7 @@ def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.model_name = None
self.loaded_embeddings = {}
self.shared_parameters = nn.ModuleDict()
self._active_embedding = "default"

# In some cases, the config is not an instance of a directly supported config class such as BertConfig.
@@ -185,6 +186,11 @@ def train_adapter(self, adapter_setup: Union[list, AdapterCompositionBlock], tra
self.freeze_model(True)
adapter_setup = parse_composition(adapter_setup)
self.apply_to_adapter_layers(lambda i, layer: layer.enable_adapters(adapter_setup, True, False))
for adapter_name in adapter_setup:
if adapter_name in self.shared_parameters:
for param in self.shared_parameters[adapter_name].values():
param.requires_grad = True

if isinstance(self, InvertibleAdaptersMixin):
self.enable_invertible_adapters(adapter_setup.flatten())
# use the adapters to be trained by default in every forward pass
@@ -230,6 +236,9 @@ def active_adapters(self) -> AdapterCompositionBlock:
def active_adapters(self, adapter_setup: Union[list, AdapterCompositionBlock]):
self.set_active_adapters(adapter_setup)

def set_shared_parameters(self, param):
self.shared_parameters = param

def set_active_adapters(
self, adapter_setup: Union[list, AdapterCompositionBlock], skip_layers: Optional[List[int]] = None
):
@@ -274,6 +283,9 @@ def add_adapter(self, adapter_name: str, config=None, overwrite_ok: bool = False
self.config.adapters.add(adapter_name, config=config)
try:
self.apply_to_adapter_layers(lambda i, layer: layer.add_adapter(adapter_name, i))
# PHM Layer
if self.config.adapters.match(adapter_name, AdapterConfig, location_key="phm_layer"):
self._add_shared_parameters(adapter_name, config)
# Prefix Tuning
for module in self.modules():
if isinstance(module, PrefixTuningPool):
@@ -286,6 +298,11 @@ def add_adapter(self, adapter_name: str, config=None, overwrite_ok: bool = False
if set_active:
self.set_active_adapters(adapter_name)

def _add_shared_parameters(self, adapter_name, adapter_config: AdapterConfig):
self.shared_parameters[adapter_name] = (
list(self.get_adapter(adapter_name)[0].values())[0].adapter_down[0].init_shared_parameters()
)

def add_fusion(self, adapter_names: Union[Fuse, list], adapter_fusion_config=None, override_kwargs=None):
warnings.warn(
"add_fusion() has been deprecated in favor of add_adapter_fusion(). Please use the newer method instead.",
@@ -604,6 +621,10 @@ def forward_context(self, context: ForwardContext, *args, **kwargs):
return

context.adapters_parallelized = False
# Add the shared parameters for the active adapters to the context
context.shared_parameters = {
name: param for name, param in self.shared_parameters.items() if name in active_adapters.flatten()
}

# Prefix tuning
input_tensor = kwargs.get("input_ids", None)
@@ -791,6 +812,11 @@ def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self._convert_to_flex_head = False

def set_shared_parameters(self, param):
self.shared_parameters = param
if self.base_model is not self:
self.base_model.shared_parameters = self.shared_parameters

def iter_layers(self) -> Iterable[Tuple[int, nn.Module]]:
"""
Iterates over all layers of the model.
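
Putting the model_mixin changes together: add_adapter() registers the shared PHM parameters for adapters whose config enables phm_layer, train_adapter() unfreezes them alongside the adapter weights, and forward_context() exposes them to the layers of the active setup via the ForwardContext. A minimal end-to-end sketch (adapter name is a placeholder):

    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")
    model.add_adapter("demo", config="compacter++")  # also registers shared PHM parameters for "demo"
    model.train_adapter("demo")                      # freezes the base model, unfreezes adapter + shared params

    # Mirrors what train_adapter() does internally (see the diff above): the shared
    # parameters of the adapter being trained now require gradients.
    for param in model.base_model.shared_parameters["demo"].values():
        assert param.requires_grad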
(Diffs for the remaining changed files were not loaded on this page.)
