Make gradient checkpointing work with the decoder
ceshine committed Apr 24, 2021
1 parent 81254e6 commit acaeee6
Showing 1 changed file with 3 additions and 2 deletions.
src/transformers/models/t5/modeling_t5.py (3 additions, 2 deletions)
@@ -486,6 +486,8 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value):
                 position_bias = torch.zeros(
                     (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
                 )
+                if self.training:
+                    position_bias.requires_grad = True
             else:
                 position_bias = self.compute_bias(real_seq_length, key_length)

@@ -955,7 +957,6 @@ def forward(
             if (
                 getattr(self.config, "gradient_checkpointing", False)
                 and self.training
-                and (not self.config.is_decoder)
             ):
                 if use_cache:
                     logger.warn(
@@ -980,7 +981,7 @@ def custom_forward(*inputs):
                     encoder_decoder_position_bias,
                     layer_head_mask,
                     cross_attn_layer_head_mask,
-                    past_key_value,
+                    None  # past_key_value is always None with gradient checkpointing
                 )
             else:
                 layer_outputs = layer_module(

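For reference, here is a minimal usage sketch (not part of the commit) showing how the patched path is exercised. It assumes a transformers version from this era, where gradient checkpointing is toggled through the `gradient_checkpointing` config attribute that the changed code reads via getattr(); the checkpoint name and the inputs are placeholders.

    # Minimal sketch (not from the commit): run T5 with gradient checkpointing
    # active in both the encoder and, after this change, the decoder stack.
    from transformers import T5ForConditionalGeneration, T5TokenizerFast

    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    model.config.gradient_checkpointing = True  # read by the getattr() check in T5Stack.forward
    model.config.use_cache = False              # key/value caching is incompatible with checkpointing
    model.train()                               # the new requires_grad branch only runs in training mode

    tokenizer = T5TokenizerFast.from_pretrained("t5-small")
    inputs = tokenizer("translate English to German: Hello world", return_tensors="pt")
    labels = tokenizer("Hallo Welt", return_tensors="pt").input_ids

    loss = model(**inputs, labels=labels).loss
    loss.backward()  # decoder blocks are recomputed here instead of storing all activations

Passing None for past_key_value mirrors the in-line comment that the cache is never used under checkpointing, and setting requires_grad on the freshly created position_bias appears intended to keep the checkpointed blocks attached to the autograd graph when they are re-run during backward.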