From 07370a6d60c4ba10f07ecc297b1dca28b9ec5d04 Mon Sep 17 00:00:00 2001
From: calpt
Date: Mon, 8 Apr 2024 14:20:43 +0200
Subject: [PATCH] Add dropout to bottleneck adapters (#667)

Closes #414.
---
 src/adapters/configuration/adapter_config.py | 2 ++
 src/adapters/methods/modeling.py             | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/adapters/configuration/adapter_config.py b/src/adapters/configuration/adapter_config.py
index c3b45ca313..ef401d17a7 100644
--- a/src/adapters/configuration/adapter_config.py
+++ b/src/adapters/configuration/adapter_config.py
@@ -182,6 +182,7 @@ class BnConfig(AdapterConfig):
             model. Defaults to False.
         leave_out (:obj:`List[int]`, optional): The IDs of the layers (starting at 0) where NO adapter modules
             should be added.
+        dropout (:obj:`float`, optional): The dropout rate used in the adapter layer. Defaults to 0.0.
         phm_layer (:obj:`bool`, optional): If True the down and up projection layers are a PHMLayer.
             Defaults to False
         phm_dim (:obj:`int`, optional): The dimension of the phm matrix.
@@ -234,6 +235,7 @@ class BnConfig(AdapterConfig):
     inv_adapter_reduction_factor: Optional[float] = None
     cross_adapter: bool = False
     leave_out: List[int] = field(default_factory=list)
+    dropout: float = 0.0
     phm_layer: bool = False
     phm_dim: int = 4
     factorized_phm_W: Optional[bool] = True
diff --git a/src/adapters/methods/modeling.py b/src/adapters/methods/modeling.py
index 6b265e21f2..4068bc10bb 100644
--- a/src/adapters/methods/modeling.py
+++ b/src/adapters/methods/modeling.py
@@ -108,6 +108,8 @@ def __init__(
         if self.use_gating:
             self.gate = nn.Linear(self.input_size, 1)
 
+        self.dropout = nn.Dropout(p=config["dropout"])
+
         # if we want to initialize with the bert strategy then this function is called for all the linear layers
         if config["init_weights"] == "bert":
             self.adapter_down.apply(self.init_bert_weights)
@@ -173,7 +175,7 @@ def forward(self, x, residual_input, output_gating=False):
 
         up = self.adapter_up(down)
         up = up * self.scaling
-        output = up
+        output = self.dropout(up)
 
         if self.use_gating:
             # x.shape = (batch_size, seq_len, hidden_size)
@@ -271,7 +273,7 @@ def forward(self, x, residual_input, output_gating=False):
 
         up = self.adapter_up(down)
         up = up * self.scaling
-        output = up
+        output = self.dropout(up)
 
         if self.use_gating:
             # x.shape = (batch_size, seq_len, hidden_size)
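
Usage note (not part of the patch): a minimal sketch of how the new `dropout` field could be used once this change is in place. It assumes the `adapters` library's standard setup API (`adapters.init`, `SeqBnConfig`, `add_adapter`, `train_adapter`); the adapter name and the checkpoint `bert-base-uncased` are chosen purely for illustration. Only the `dropout` field itself is introduced by this patch.

```python
from transformers import AutoModel

import adapters
from adapters import SeqBnConfig

# Load a base model and enable adapter support.
model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)

# Bottleneck adapter config; `dropout` is the field added by this patch.
# Per the diff, it is applied to the up-projection output after scaling.
config = SeqBnConfig(reduction_factor=16, dropout=0.1)

model.add_adapter("bn_with_dropout", config=config)
model.train_adapter("bn_with_dropout")  # activate the adapter and freeze the base model
```

With the default `dropout=0.0` the added `nn.Dropout` module is a no-op, so existing adapter configurations keep their previous behavior.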