diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 47fca97fb4a9a7..c675ddf30df5e1 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -252,7 +252,7 @@ def __call__(self, hidden_states, attention_mask, deterministic=True):
             attention_bias = None
 
         dropout_rng = None
-        if not deterministic and self.dropout_rate > 0.0:
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")
 
         attn_output = dot_product_attention(
diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py
index d0fdb60273b8ef..d4311a9286268e 100644
--- a/src/transformers/models/roberta/modeling_flax_roberta.py
+++ b/src/transformers/models/roberta/modeling_flax_roberta.py
@@ -271,7 +271,7 @@ def __call__(self, hidden_states, attention_mask, deterministic=True):
             attention_bias = None
 
         dropout_rng = None
-        if not deterministic and self.dropout_rate > 0.0:
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")
 
         attn_output = dot_product_attention(
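
Note: both hunks make the same fix. The old guard read `self.dropout_rate`, which does not appear to be an attribute of these Flax attention modules; the attention dropout rate lives on the model config as `attention_probs_dropout_prob`, so the condition now reads it from `self.config` before requesting a "dropout" RNG. Below is a minimal, self-contained sketch of the pattern under the same condition; `SimpleConfig` and `ToySelfAttention` are hypothetical names for illustration, not the library's classes, and only the RNG guard mirrors the diff.

```python
# Minimal sketch (not the library code): draw a "dropout" RNG only when
# dropout will actually be applied, reading the rate from the config.
from dataclasses import dataclass

import jax
import jax.numpy as jnp
import flax.linen as nn
from flax.linen.attention import dot_product_attention


@dataclass
class SimpleConfig:  # hypothetical stand-in for a model config
    hidden_size: int = 8
    num_attention_heads: int = 2
    attention_probs_dropout_prob: float = 0.1


class ToySelfAttention(nn.Module):  # hypothetical module for illustration
    config: SimpleConfig

    @nn.compact
    def __call__(self, hidden_states, deterministic: bool = True):
        cfg = self.config
        head_dim = cfg.hidden_size // cfg.num_attention_heads

        def project(name):
            # Project and split into (batch, length, heads, head_dim).
            x = nn.Dense(cfg.hidden_size, name=name)(hidden_states)
            return x.reshape(hidden_states.shape[:2] + (cfg.num_attention_heads, head_dim))

        query, key, value = project("query"), project("key"), project("value")

        # Same guard as the fix: only request an RNG when not deterministic
        # and the configured attention dropout rate is positive.
        dropout_rng = None
        if not deterministic and cfg.attention_probs_dropout_prob > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_output = dot_product_attention(
            query,
            key,
            value,
            dropout_rng=dropout_rng,
            dropout_rate=cfg.attention_probs_dropout_prob,
            deterministic=deterministic,
        )
        # Merge heads back to (batch, length, hidden_size).
        return attn_output.reshape(hidden_states.shape[:2] + (cfg.hidden_size,))


# Usage: training mode needs a "dropout" RNG, eval mode does not.
config = SimpleConfig()
module = ToySelfAttention(config)
x = jnp.ones((1, 4, config.hidden_size))
params = module.init(jax.random.PRNGKey(0), x, deterministic=True)
out = module.apply(
    params, x, deterministic=False, rngs={"dropout": jax.random.PRNGKey(1)}
)
```

With this guard, `apply` only needs an entry in `rngs={"dropout": ...}` when dropout is actually active; in deterministic (eval) mode, or when the configured rate is 0.0, no RNG is requested at all.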