Show a warning for missing attention masks when pad_token_id is not None #24510

Merged 6 commits on Jun 30, 2023
30 changes: 30 additions & 0 deletions src/transformers/modeling_utils.py
@@ -3477,6 +3477,36 @@ def reverse_bettertransformer(self):

        return BetterTransformer.reverse(self)

    def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask):
        """
        Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
        """
        if (attention_mask is not None) or (self.config.pad_token_id is None):
            return

        # Check only the first and last input IDs to reduce overhead.
        if self.config.pad_token_id in input_ids[:, [-1, 0]]:
            warn_string = (
                "We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See "
                "https://huggingface.co/docs/transformers/troubleshooting"
                "#incorrect-output-when-padding-tokens-arent-masked."
            )

            # If the pad token is equal to either BOS, EOS, or SEP, we cannot tell whether the input is actually
            # padded. This is a rare case, so we still show the warning but point out that it may not apply.
            if (
                (self.config.bos_token_id is not None and self.config.bos_token_id == self.config.pad_token_id)
                or (self.config.eos_token_id is not None and self.config.eos_token_id == self.config.pad_token_id)
                or (self.config.sep_token_id is not None and self.config.sep_token_id == self.config.pad_token_id)
            ):
                warn_string += (
                    f"\nYou may ignore this warning if your `pad_token_id` ({self.config.pad_token_id}) is identical "
                    f"to the `bos_token_id` ({self.config.bos_token_id}), `eos_token_id` ({self.config.eos_token_id}), "
                    f"or the `sep_token_id` ({self.config.sep_token_id}), and your input is not padded."
                )

            logger.warning_once(warn_string)


PreTrainedModel.push_to_hub = copy_func(PreTrainedModel.push_to_hub)
if PreTrainedModel.push_to_hub.__doc__ is not None:
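To make the check above easier to follow, here is a small standalone sketch (not part of the PR; the token values and pad id are invented for illustration) of how slicing out the first and last positions keeps the scan cheap:

```python
import torch

pad_token_id = 0  # hypothetical pad id, standing in for config.pad_token_id
input_ids = torch.tensor(
    [
        [101, 2054, 2003, 2023, 102, 0, 0],  # right-padded sequence
        [101, 2054, 2003, 2023, 2172, 2936, 102],  # unpadded sequence
    ]
)

# Only the first and last column of each row is inspected, so the cost grows
# with the batch size rather than with batch size times sequence length.
edge_tokens = input_ids[:, [-1, 0]]
print(pad_token_id in edge_tokens)  # True -> the one-time warning would fire
```

Left padding is covered by the same slice because the first column is checked as well; padding that only appears in the middle of a sequence is not detected, which is the trade-off of sampling just two positions.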
1 change: 1 addition & 0 deletions src/transformers/models/altclip/modeling_altclip.py
@@ -1305,6 +1305,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
1 change: 1 addition & 0 deletions src/transformers/models/bert/modeling_bert.py
@@ -967,6 +967,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
@@ -1118,6 +1118,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
1 change: 1 addition & 0 deletions src/transformers/models/camembert/modeling_camembert.py
@@ -842,6 +842,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
1 change: 1 addition & 0 deletions src/transformers/models/clap/modeling_clap.py
@@ -1854,6 +1854,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
1 change: 1 addition & 0 deletions src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -791,6 +791,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
1 change: 1 addition & 0 deletions src/transformers/models/roberta/modeling_roberta.py
@@ -789,6 +789,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
@@ -791,6 +791,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
@@ -757,6 +757,7 @@ def forward(
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = input_ids.size()
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
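The model-side change above is the same one-line call added right after `input_shape` is computed in each `forward`. As a rough end-to-end sketch of how the warning surfaces to users (a toy `BertConfig` invented for illustration, not taken from the PR):

```python
import torch
from transformers import BertConfig, BertModel

# Tiny randomly initialized model so the example runs instantly; pad_token_id=0 as in BERT.
config = BertConfig(
    vocab_size=100,
    hidden_size=32,
    num_hidden_layers=1,
    num_attention_heads=2,
    intermediate_size=64,
    pad_token_id=0,
)
model = BertModel(config).eval()

input_ids = torch.tensor([[5, 6, 7, 0, 0]])  # trailing zeros look like padding

with torch.no_grad():
    # Without an attention_mask this logs the new one-time warning:
    # "We strongly recommend passing in an `attention_mask` ..."
    model(input_ids)
    # Passing the mask (or unpadded inputs) keeps the forward pass silent.
    model(input_ids, attention_mask=torch.tensor([[1, 1, 1, 0, 0]]))
```

Because the warning goes through `logger.warning_once`, repeated calls with padded inputs only log it the first time.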
26 changes: 25 additions & 1 deletion tests/models/bert/test_modeling_bert.py
@@ -18,7 +18,7 @@

from transformers import BertConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
from transformers.testing_utils import CaptureLogger, require_torch, require_torch_gpu, slow, torch_device

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@@ -40,6 +40,7 @@
        BertForTokenClassification,
        BertLMHeadModel,
        BertModel,
        logging,
    )
    from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST

@@ -567,6 +568,29 @@ def test_for_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)

    def test_for_warning_if_padding_and_no_attention_mask(self):
        (
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
        ) = self.model_tester.prepare_config_and_inputs()

        # Set a pad token at the start of the input_ids
        input_ids[0, 0] = config.pad_token_id

        # Check that a warning is shown when the attention_mask is missing.
        logger = logging.get_logger("transformers.modeling_utils")
        with CaptureLogger(logger) as cl:
            model = BertModel(config=config)
            model.to(torch_device)
            model.eval()
            model(input_ids, attention_mask=None, token_type_ids=token_type_ids)
        self.assertIn("We strongly recommend passing in an `attention_mask`", cl.out)

    @slow
    def test_model_from_pretrained(self):
        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
76 changes: 76 additions & 0 deletions tests/test_modeling_utils.py
@@ -938,6 +938,82 @@ def test_unexpected_keys_warnings(self):
self.assertIn("were not used when initializing ModelWithHead: ['added_key']", cl.out)
self.assertEqual(loading_info["unexpected_keys"], ["added_key"])

    def test_warn_if_padding_and_no_attention_mask(self):
        logger = logging.get_logger("transformers.modeling_utils")

        with self.subTest("Ensure no warnings when pad_token_id is None."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config_no_pad_token = PretrainedConfig()
                config_no_pad_token.pad_token_id = None
                model = ModelWithHead(config_no_pad_token)
                input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 0, 0]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertNotIn("We strongly recommend passing in an `attention_mask`", cl.out)

        with self.subTest("Ensure no warnings when there is an attention_mask."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                model = ModelWithHead(config)
                input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 0, 0]])
                attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            self.assertNotIn("We strongly recommend passing in an `attention_mask`", cl.out)

        with self.subTest("Ensure no warnings when there are no pad_token_ids in the input_ids."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                model = ModelWithHead(config)
                input_ids = torch.tensor([[1, 345, 232, 328, 740, 140, 1695, 69, 6078, 2341, 25]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertNotIn("We strongly recommend passing in an `attention_mask`", cl.out)

        with self.subTest("Ensure a warning is shown when the input_ids start with a pad_token_id."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                model = ModelWithHead(config)
                input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 432, 5232]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertIn("We strongly recommend passing in an `attention_mask`", cl.out)

        with self.subTest("Ensure a warning is shown when the input_ids end with a pad_token_id."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                model = ModelWithHead(config)
                input_ids = torch.tensor([[432, 345, 232, 328, 740, 140, 1695, 69, 6078, 0, 0]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertIn("We strongly recommend passing in an `attention_mask`", cl.out)

        with self.subTest("Ensure that the warning is shown at most once."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                model = ModelWithHead(config)
                input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 0, 0]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertEqual(cl.out.count("We strongly recommend passing in an `attention_mask`"), 1)

        with self.subTest("Ensure a different warning is shown when the pad_token_id is equal to the bos_token_id."):
            logger.warning_once.cache_clear()
            with CaptureLogger(logger) as cl:
                config = PretrainedConfig()
                config.pad_token_id = 0
                config.bos_token_id = config.pad_token_id
                model = ModelWithHead(config)
                input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 0, 0]])
                model.warn_if_padding_and_no_attention_mask(input_ids, attention_mask=None)
            self.assertIn("You may ignore this warning if your `pad_token_id`", cl.out)

    @require_torch_gpu
    @slow
    def test_pretrained_low_mem_new_config(self):
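The sub-tests above call `logger.warning_once.cache_clear()` so each case starts with a fresh warning state, which implies `warning_once` is memoized in the style of `functools.lru_cache`. A rough standalone approximation of that behaviour (an illustration, not the library's actual implementation):

```python
import functools
import logging

@functools.lru_cache(None)
def warning_once(logger: logging.Logger, message: str) -> None:
    # Repeated (logger, message) pairs hit the cache, so the message is only
    # logged once per process; cache_clear() resets that, which is what the
    # sub-tests above rely on to isolate each case.
    logger.warning(message)

logging.basicConfig()
logger = logging.getLogger("demo")

warning_once(logger, "We strongly recommend passing in an `attention_mask`")  # logged
warning_once(logger, "We strongly recommend passing in an `attention_mask`")  # suppressed
warning_once.cache_clear()
warning_once(logger, "We strongly recommend passing in an `attention_mask`")  # logged again
```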