Tests: upgrade test_eager_matches_sdpa_generate #34386

Merged 3 commits on Oct 25, 2024
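
For context, a minimal standalone sketch of what the test compares — not part of this PR, and assuming PyTorch 2.x for torch.nn.functional.scaled_dot_product_attention: eager attention and SDPA compute the same function, so their outputs should agree up to small numerical differences (larger in fp16, hence the tolerances used in the test below).

import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 4, 8, 16)
v = torch.randn(1, 4, 8, 16)

# "eager" attention: explicit softmax(QK^T / sqrt(d)) @ V
scores = q @ k.transpose(-1, -2) / (q.shape[-1] ** 0.5)
eager_out = torch.softmax(scores, dim=-1) @ v

# fused SDPA kernel
sdpa_out = F.scaled_dot_product_attention(q, k, v)

# mathematically identical, numerically close
print(torch.allclose(eager_out, sdpa_out, atol=1e-5))
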
82 changes: 82 additions & 0 deletions tests/generation/test_utils.py
@@ -15,6 +15,7 @@


import copy
import gc
import inspect
import tempfile
import unittest
@@ -33,6 +34,7 @@
require_torch_gpu,
require_torch_multi_accelerator,
require_torch_multi_gpu,
require_torch_sdpa,
slow,
torch_device,
)
@@ -2046,6 +2048,86 @@ def test_inherits_generation_mixin(self):
for model_class in self.all_generative_model_classes:
self.assertTrue("GenerationMixin" in str(model_class.__bases__))

@require_torch_sdpa
Member Author:
(this is mostly copy-paste, going to comment the sections that are changed)

@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30

for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")

config, original_inputs_dict = self.prepare_config_and_inputs_for_generate()
inputs_dict = {}
for input_name, input_data in original_inputs_dict.items():
if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]:
inputs_dict[input_name] = input_data.to(torch.float16)
else:
inputs_dict[input_name] = input_data
main_input = inputs_dict[model_class.main_input_name]
Member Author, commenting on lines +2060 to +2067:
Uses self.prepare_config_and_inputs_for_generate() instead, which enables us to pass a dictionary of inputs to generate (better input control than simply using inputs_dict[model_class.main_input_name])
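
As a rough illustration (not from the PR; the tensors and the commented-out generate calls are made up and assume an already-loaded model object), the practical difference is that the whole prepared dict is unpacked into generate, so model-specific inputs such as attention masks are forwarded too:

import torch

dummy_input = torch.randint(0, 100, (2, 5))  # e.g. input_ids only
# old style: forward only the main input
# model.generate(dummy_input, max_new_tokens=30)

inputs_dict = {
    "input_ids": dummy_input,
    "attention_mask": torch.ones_like(dummy_input),
}
# new style: unpack the full prepared inputs dict
# model.generate(**inputs_dict, max_new_tokens=30)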


# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1

model = model_class(config)

with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
del model
gc.collect()

generate_kwargs = {
"max_new_tokens": max_new_tokens,
"do_sample": False,
"return_dict_in_generate": True,
"output_scores": True,
}

model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs)
del model_sdpa
gc.collect()

model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
res_eager = model_eager.generate(**inputs_dict, **generate_kwargs)
del model_eager
gc.collect()

# Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this
# test would be flaky if we only checked the sequences. Two situations in which this test passes:
# 1. The sequences are the same
# 2. The sequences are different, but the scores up until the first mismatch are nearly identical
output_matches = res_eager.sequences == res_sdpa.sequences
has_matching_outputs = output_matches.all()
has_matching_scores = None
if not has_matching_outputs:
input_length = main_input.shape[1]
for batch_idx in range(res_eager.sequences.shape[0]):
batch_matches = output_matches[batch_idx]
if batch_matches.all():
continue
first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False
first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens
sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx]
eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx]
has_matching_scores = torch.allclose(
sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3
)
if not has_matching_scores:
break
Member Author, commenting on lines +2106 to +2127:
flakiness handling as explained in the PR header


self.assertTrue(has_matching_outputs or has_matching_scores)
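
A toy, self-contained sketch of the flakiness handling above (made-up tensors, not from the PR): locate the first generation step where the two sequences diverge, then compare the score vectors at that step with torch.allclose.

import torch

input_length = 2  # prompt tokens appear in `sequences` but have no entry in `scores`
eager_seq = torch.tensor([[5, 9, 11, 3], [5, 9, 12, 7]])
sdpa_seq = torch.tensor([[5, 9, 11, 3], [5, 9, 13, 7]])  # batch 1 diverges at generation step 0

matches = eager_seq == sdpa_seq
for batch_idx in range(eager_seq.shape[0]):
    if matches[batch_idx].all():
        continue
    # argmin over the int-cast mask returns the index of the first False
    first_mismatch = matches[batch_idx].int().argmin() - input_length
    print(f"batch {batch_idx}: sequences diverge at generation step {first_mismatch.item()}")
    # the real test then runs torch.allclose(res_sdpa.scores[step][batch_idx],
    #                                        res_eager.scores[step][batch_idx], rtol=1e-3, atol=1e-3)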

def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1):
# we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image
# so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests`
74 changes: 0 additions & 74 deletions tests/models/bert/test_modeling_bert.py
@@ -22,7 +22,6 @@
CaptureLogger,
require_torch,
require_torch_accelerator,
require_torch_sdpa,
slow,
torch_device,
)
@@ -672,79 +671,6 @@ def test_torchscript_device_change(self):
loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), map_location=torch_device)
loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device))

# This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True.
# TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True.
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30

if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")

for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")

config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)

# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1

model = model_class(config)

with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)

dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))

model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
# low_cpu_mem_usage=True,
).to(torch_device)

self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")

model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
# low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)

self.assertTrue(model_eager.config._attn_implementation == "eager")

for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")

has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")

# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)

res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)

self.assertTrue(torch.allclose(res_eager, res_sdpa))


@require_torch
class BertModelIntegrationTest(unittest.TestCase):
58 changes: 0 additions & 58 deletions tests/models/cohere/test_modeling_cohere.py
@@ -307,64 +307,6 @@ def test_model_various_embeddings(self):
def test_torch_fx_output_loss(self):
super().test_torch_fx_output_loss()

@require_bitsandbytes
@require_torch_sdpa
@require_torch_multi_gpu
@slow
def test_eager_matches_sdpa_generate(self):
"""
Overwritting the common test as the test is flaky on tiny models
"""
max_new_tokens = 30

model_id = "CohereForAI/c4ai-command-r-v01-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model_sdpa = CohereForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto"
)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")

model_eager = CohereForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float16, attn_implementation="eager", device_map="auto"
)

self.assertTrue(model_eager.config._attn_implementation == "eager")

for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")

has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")

texts = [
"hi here's a longer context, getting longer and",
"Hello this is a very long sentence my friend, very long for real",
"Today I am in Paris and",
]

for padding_side in ["left", "right"]:
tokenizer.padding_side = padding_side
tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)

res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

with self.subTest(f"{padding_side}"):
torch.testing.assert_close(
res_eager,
res_sdpa,
msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}",
)


@require_torch
@slow
74 changes: 0 additions & 74 deletions tests/models/falcon/test_modeling_falcon.py
@@ -14,7 +14,6 @@
# limitations under the License.
"""Testing suite for the PyTorch Falcon model."""

import tempfile
import unittest

from parameterized import parameterized
@@ -27,7 +26,6 @@
set_seed,
)
from transformers.testing_utils import (
is_flaky,
require_bitsandbytes,
require_torch,
require_torch_sdpa,
@@ -520,78 +518,6 @@ def test_model_rope_scaling(self):
torch.testing.assert_close(ntk_sin_long, original_sin_long)
self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all())

# TODO: @Fxmarty
@is_flaky(max_attempts=3, description="flaky on some models.")
@require_torch_sdpa
@slow
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30

if len(self.all_generative_model_classes) == 0:
self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")

for model_class in self.all_generative_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")

config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)

# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1

model = model_class(config)

with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)

dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))

model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)

self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")

model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)

self.assertTrue(model_eager.config._attn_implementation == "eager")

# NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason).
# for name, submodule in model_eager.named_modules():
# if "SdpaAttention" in submodule.__class__.__name__:
# raise ValueError("The eager model should not have SDPA attention layers")

# has_sdpa = False
# for name, submodule in model_sdpa.named_modules():
# if "SdpaAttention" in submodule.__class__.__name__:
# has_sdpa = True
# break
# if not has_sdpa:
# raise ValueError("The SDPA model should have SDPA attention layers")

# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)

res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)

self.assertTrue(torch.allclose(res_eager, res_sdpa))


@require_torch
class FalconLanguageGenerationTest(unittest.TestCase):