Add dropout to bottleneck adapters #667

Merged · 1 commit · Apr 8, 2024
2 changes: 2 additions & 0 deletions src/adapters/configuration/adapter_config.py
@@ -182,6 +182,7 @@ class BnConfig(AdapterConfig):
             model. Defaults to False.
         leave_out (:obj:`List[int]`, optional):
             The IDs of the layers (starting at 0) where NO adapter modules should be added.
+        dropout (:obj:`float`, optional): The dropout rate used in the adapter layer. Defaults to 0.0.
         phm_layer (:obj:`bool`, optional): If True the down and up projection layers are a PHMLayer.
             Defaults to False
         phm_dim (:obj:`int`, optional): The dimension of the phm matrix.
@@ -234,6 +235,7 @@ class BnConfig(AdapterConfig):
     inv_adapter_reduction_factor: Optional[float] = None
     cross_adapter: bool = False
     leave_out: List[int] = field(default_factory=list)
+    dropout: float = 0.0
     phm_layer: bool = False
     phm_dim: int = 4
     factorized_phm_W: Optional[bool] = True
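For context, the snippet below shows how the new `dropout` field could be set when adding a bottleneck adapter. This is a minimal usage sketch, not part of the PR: the model name and the `adapters.init` / `add_adapter` / `train_adapter` calls follow the library's usual setup, and the remaining `BnConfig` arguments mirror the fields visible in the diff above.

```python
# Minimal usage sketch (assumption, not from this PR): configure a bottleneck
# adapter with the new `dropout` option.
from transformers import AutoModel

import adapters
from adapters import BnConfig

model = AutoModel.from_pretrained("bert-base-uncased")
adapters.init(model)  # enable adapter support on a plain transformers model

config = BnConfig(
    mh_adapter=False,     # no adapter after the attention block
    output_adapter=True,  # adapter after the feed-forward block
    reduction_factor=16,  # bottleneck size = hidden_size / 16
    non_linearity="relu",
    dropout=0.1,          # new field added by this PR (defaults to 0.0)
)
model.add_adapter("bn_with_dropout", config=config)
model.train_adapter("bn_with_dropout")
```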
6 changes: 4 additions & 2 deletions src/adapters/methods/modeling.py
@@ -108,6 +108,8 @@ def __init__(
         if self.use_gating:
             self.gate = nn.Linear(self.input_size, 1)
 
+        self.dropout = nn.Dropout(p=config["dropout"])
+
         # if we want to initialize with the bert strategy then this function is called for all the linear layers
         if config["init_weights"] == "bert":
             self.adapter_down.apply(self.init_bert_weights)
@@ -173,7 +175,7 @@ def forward(self, x, residual_input, output_gating=False):
 
         up = self.adapter_up(down)
         up = up * self.scaling
-        output = up
+        output = self.dropout(up)
 
         if self.use_gating:
             # x.shape = (batch_size, seq_len, hidden_size)
@@ -271,7 +273,7 @@ def forward(self, x, residual_input, output_gating=False):
         up = self.adapter_up(down)
         up = up * self.scaling
 
-        output = up
+        output = self.dropout(up)
 
         if self.use_gating:
             # x.shape = (batch_size, seq_len, hidden_size)
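To make the placement concrete, here is a small standalone sketch of a bottleneck adapter forward pass with dropout applied where this PR puts it: after the up-projection and scaling, before gating and the residual connection. The class, its parameter names, and the residual handling are simplified for illustration and are not the library code.

```python
import torch
from torch import nn


class BottleneckAdapterSketch(nn.Module):
    """Illustrative bottleneck adapter; not the library implementation."""

    def __init__(self, hidden_size: int, reduction_factor: int = 16, dropout: float = 0.0):
        super().__init__()
        bottleneck_size = hidden_size // reduction_factor
        self.adapter_down = nn.Sequential(nn.Linear(hidden_size, bottleneck_size), nn.ReLU())
        self.adapter_up = nn.Linear(bottleneck_size, hidden_size)
        self.scaling = 1.0
        # New in this PR: dropout on the adapter output (p=0.0 keeps the previous behaviour).
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor, residual_input: torch.Tensor) -> torch.Tensor:
        down = self.adapter_down(x)
        up = self.adapter_up(down) * self.scaling
        output = self.dropout(up)       # dropout before the residual add
        return output + residual_input  # simplified residual connection


# Quick shape check
adapter = BottleneckAdapterSketch(hidden_size=768, dropout=0.1)
hidden = torch.randn(2, 16, 768)
print(adapter(hidden, hidden).shape)  # torch.Size([2, 16, 768])
```

With `dropout=0.0` (the default) the module is an identity in both training and evaluation, so existing configurations keep their previous behaviour.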