65 changes: 50 additions & 15 deletions tests/causal_lm_tester.py
@@ -18,7 +18,7 @@
import pytest
from parameterized import parameterized

from transformers import set_seed
from transformers import PretrainedConfig, set_seed
from transformers.testing_utils import (
is_flaky,
require_flash_attn,
@@ -230,7 +230,6 @@ class CausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
test_pruning = False
model_tester_class = None
all_model_classes = None
rotary_embedding_layer = None # Enables RoPE tests if set
pipeline_model_mapping = None

def setUp(self):
@@ -319,21 +318,28 @@ def test_token_classification_model(self):

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
def test_model_rope_scaling_from_config(self, scaling_type):
if self.rotary_embedding_layer is None:
self.skipTest("Rotary embedding layer not set")
"""
Tests that we can initialize a model with RoPE scaling in the config, that it can run a forward pass, and
that a few basic model output properties are honored.
"""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

if not _config_supports_rope_scaling(config):
self.skipTest("This model does not support RoPE scaling")

short_input = ids_tensor([1, 10], config.vocab_size)
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)

set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"rope_type": "default"}
original_model = self.model_tester_class.base_model_class(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
original_long_output = original_model(long_input).last_hidden_state

set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
config.rope_scaling = {"rope_type": scaling_type, "factor": 10.0}
scaled_model = self.model_tester_class.base_model_class(config)
scaled_model.to(torch_device)
scaled_model.eval()
@@ -350,10 +356,26 @@ def test_model_rope_scaling_from_config(self, scaling_type):
# The output should be different for long inputs
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))

def test_model_rope_scaling(self):
if self.rotary_embedding_layer is None:
self.skipTest("Rotary embedding layer not set")
def test_model_rope_scaling_frequencies(self):
"""Tests the frequency properties of the different RoPE scaling types on the model RoPE layer."""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

if not _config_supports_rope_scaling(config):
self.skipTest("This model does not support RoPE scaling")

# Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the
# named location of the RoPE layer class.
base_model = self.model_tester.base_model_class(config)
Member:
We might need `model.get_decoder()` as well for models where the LM backbone is hidden inside the base model, though I guess these tests aren't yet used in multimodal models.

Member (Author):
Good point. Given that the tests are only run on decoder-only models for now, I'd rather leave it as is (and upgrade when it's needed) 🤗
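A minimal sketch of the reviewer's suggestion (illustrative only, not part of this PR; `base_model` is assumed to be an instantiated model, and `get_decoder()` is assumed to return the nested text backbone when one exists):

# Hypothetical fallback: search the nested decoder when the text backbone is wrapped
# (e.g. multimodal models); otherwise search the base model itself, as the code below does.
search_root = base_model.get_decoder() if hasattr(base_model, "get_decoder") else base_model
possible_rope_attributes = ["rotary_emb", "global_rotary_emb", "local_rotary_emb"]
rope_class = None
for name, module in search_root.named_modules():
    if any(potential_name in name for potential_name in possible_rope_attributes):
        rope_class = type(module)
        break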

possible_rope_attributes = [
"rotary_emb", # most common case
"global_rotary_emb",
"local_rotary_emb",
]
for name, module in base_model.named_modules():
if any(potential_name in name for potential_name in possible_rope_attributes):
rope_class = type(module)
break

scaling_factor = 10
short_input_length = 10
long_input_length = int(config.max_position_embeddings * 1.5)
@@ -368,16 +390,17 @@ def test_model_rope_scaling(self):
position_ids_long = position_ids_long.unsqueeze(0)

# Sanity check original RoPE
original_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "default"}
original_rope = rope_class(config=config).to(torch_device)
original_cos_short, original_sin_short = original_rope(x, position_ids_short)
original_cos_long, original_sin_long = original_rope(x, position_ids_long)
torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :])
torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :])

# Sanity check linear RoPE scaling
# New position "x" should match original position with index "x/scaling_factor"
config.rope_scaling = {"type": "linear", "factor": scaling_factor}
linear_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor}
linear_scaling_rope = rope_class(config=config).to(torch_device)
linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short)
linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long)
torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :])
@@ -390,8 +413,8 @@ def test_model_rope_scaling(self):
# Sanity check Dynamic NTK RoPE scaling
# Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
# with scaling_factor (or that `inv_freq` decreases)
config.rope_scaling = {"type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = rope_class(config=config).to(torch_device)
ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short)
ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long)
torch.testing.assert_close(ntk_cos_short, original_cos_short)
@@ -404,8 +427,8 @@ def test_model_rope_scaling(self):

# Sanity check Yarn RoPE scaling
# Scaling should be over the entire input
config.rope_scaling = {"type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = rope_class(config=config).to(torch_device)
yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short)
yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long)
torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :])
@@ -450,3 +473,15 @@ def test_flash_attn_2_equivalence(self):
logits = outputs.hidden_states[-1]
logits_fa = outputs_fa.hidden_states[-1]
torch.testing.assert_close(logits_fa, logits, atol=3e-2, rtol=3e-2)


def _config_supports_rope_scaling(config: PretrainedConfig) -> bool:
"""Returns whether a certain model config supports RoPE scaling parameterization."""
# Has rope_scaling -> model was designed with rope scaling in mind
# Has rope_theta (and no rope_scaling) -> probably an older model, but should support rope scaling as well
main_config_has_rope = hasattr(config, "rope_scaling") or hasattr(config, "rope_theta")
sub_config_has_rope = any(
hasattr(config[sub_config], "rope_scaling") or hasattr(config[sub_config], "rope_theta")
for sub_config in config.sub_configs.keys()
)
return main_config_has_rope or sub_config_has_rope
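A quick illustration of the helper (not part of the diff, and assuming `LlamaConfig` keeps exposing `rope_theta` plus the `sub_configs` mapping used above):

from transformers import LlamaConfig

# A decoder-only config with `rope_theta` passes the check, so the RoPE scaling tests
# above run for it rather than being skipped.
assert _config_supports_rope_scaling(LlamaConfig())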
2 changes: 0 additions & 2 deletions tests/models/arcee/test_modeling_arcee.py
@@ -39,7 +39,6 @@
ArceeForTokenClassification,
ArceeModel,
)
from transformers.models.arcee.modeling_arcee import ArceeRotaryEmbedding


class ArceeModelTester(CausalLMModelTester):
@@ -80,7 +79,6 @@ class ArceeModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False
model_tester_class = ArceeModelTester
rotary_embedding_layer = ArceeRotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
11 changes: 11 additions & 0 deletions tests/models/dbrx/test_modeling_dbrx.py
@@ -15,6 +15,8 @@

import unittest

from parameterized import parameterized

from transformers import DbrxConfig, is_torch_available
from transformers.testing_utils import require_torch, slow

@@ -122,6 +124,15 @@ def test_disk_offload_safetensors(self):
def test_disk_offload_bin(self):
pass

@unittest.skip("Dbrx doesn't have RoPE scaling implemented")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("Dbrx doesn't have RoPE scaling implemented")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class DbrxModelIntegrationTest(unittest.TestCase):
13 changes: 8 additions & 5 deletions tests/models/deepseek_v2/test_modeling_deepseek_v2.py
@@ -82,13 +82,16 @@ class DeepseekV2ModelTest(CausalLMModelTest, unittest.TestCase):
test_torchscript = False
test_all_params_have_gradient = False
model_tester_class = DeepseekV2ModelTester
rotary_embedding_layer = DeepseekV2RotaryEmbedding
model_split_percents = [0.5, 0.7, 0.8]

# used in `test_torch_compile_for_training`
_torch_compile_train_cls = DeepseekV2ForCausalLM if is_torch_available() else None

def test_model_rope_scaling(self):
def test_model_rope_scaling_frequencies(self):
"""
Overwritten: DeepseekV2 implements RoPE in the complex domain, as opposed to in the real domain with
`sin` and `cos`. Nevertheless, the checks are the same as in the original test.
"""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
scaling_factor = 10
short_input_length = 10
@@ -109,7 +112,7 @@ def test_model_rope_scaling(self):

# Sanity check linear RoPE scaling
# New position "x" should match original position with index "x/scaling_factor"
config.rope_scaling = {"type": "linear", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor}
linear_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
linear_freqs_cis_short = linear_scaling_rope(x, position_ids_short)
linear_freqs_cis_long = linear_scaling_rope(x, position_ids_long)
@@ -118,7 +121,7 @@ def test_model_rope_scaling(self):
# Sanity check Dynamic NTK RoPE scaling
# Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
# with scaling_factor (or that `inv_freq` decreases)
config.rope_scaling = {"type": "dynamic", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
ntk_freqs_cis_short = ntk_scaling_rope(x, position_ids_short)
ntk_freqs_cis_long = ntk_scaling_rope(x, position_ids_long)
@@ -129,7 +132,7 @@ def test_model_rope_scaling(self):

# Sanity check Yarn RoPE scaling
# Scaling should be over the entire input
config.rope_scaling = {"type": "yarn", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
yarn_freqs_cis_short = yarn_scaling_rope(x, position_ids_short)
yarn_freqs_cis_long = yarn_scaling_rope(x, position_ids_long)
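As a side note to the docstring above, a minimal sketch of why the complex-domain and `sin`/`cos` parameterizations carry the same information (illustrative only, standard RoPE math rather than the exact DeepseekV2 code):

import torch

# e^{i*theta} packs cos(theta) and sin(theta) into one complex number, so frequency
# checks written against `freqs_cis` mirror the real-domain `cos`/`sin` checks.
theta = torch.rand(10)
freqs_cis = torch.polar(torch.ones_like(theta), theta)
assert torch.allclose(freqs_cis.real, torch.cos(theta))
assert torch.allclose(freqs_cis.imag, torch.sin(theta))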
2 changes: 0 additions & 2 deletions tests/models/ernie4_5/test_modeling_ernie4_5.py
@@ -37,7 +37,6 @@
Ernie4_5ForCausalLM,
Ernie4_5Model,
)
from transformers.models.ernie4_5.modeling_ernie4_5 import Ernie4_5RotaryEmbedding


class Ernie4_5ModelTester(CausalLMModelTester):
@@ -69,7 +68,6 @@ class Ernie4_5ModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False # Broken by attention refactor cc @Cyrilvallez
model_tester_class = Ernie4_5ModelTester
rotary_embedding_layer = Ernie4_5RotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
1 change: 1 addition & 0 deletions tests/models/ernie4_5_moe/test_modeling_ernie4_5_moe.py
@@ -41,6 +41,7 @@
Ernie4_5_MoeForCausalLM,
Ernie4_5_MoeModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


12 changes: 11 additions & 1 deletion tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py
@@ -15,6 +15,8 @@

import unittest

from parameterized import parameterized

from transformers import HunYuanDenseV1Config, is_torch_available
from transformers.testing_utils import (
cleanup,
@@ -30,7 +32,6 @@
HunYuanDenseV1ForSequenceClassification,
HunYuanDenseV1Model,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


@@ -78,6 +79,15 @@ def is_pipeline_test_to_skip(
):
return True

@unittest.skip("HunYuanDenseV1's RoPE has custom parameterization")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("HunYuanDenseV1's RoPE has custom parameterization")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class HunYuanDenseV1IntegrationTest(unittest.TestCase):
10 changes: 10 additions & 0 deletions tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py
@@ -16,6 +16,7 @@
import unittest

import pytest
from parameterized import parameterized

from transformers import HunYuanMoEV1Config, is_torch_available
from transformers.testing_utils import (
@@ -101,6 +102,15 @@ def test_generate_from_inputs_embeds_with_static_cache(self):
def test_generate_with_static_cache(self):
pass

@unittest.skip("HunYuanMoEV1's RoPE has custom parameterization")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("HunYuanMoEV1's RoPE has custom parameterization")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class HunYuanMoEV1IntegrationTest(unittest.TestCase):
2 changes: 0 additions & 2 deletions tests/models/llama/test_modeling_llama.py
@@ -46,7 +46,6 @@
LlamaModel,
LlamaTokenizer,
)
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding


class LlamaModelTester(CausalLMModelTester):
@@ -87,7 +86,6 @@ class LlamaModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False # Broken by attention refactor cc @Cyrilvallez
model_tester_class = LlamaModelTester
rotary_embedding_layer = LlamaRotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
1 change: 0 additions & 1 deletion tests/models/minimax/test_modeling_minimax.py
@@ -38,7 +38,6 @@
MiniMaxForTokenClassification,
MiniMaxModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


1 change: 0 additions & 1 deletion tests/models/mistral/test_modeling_mistral.py
@@ -48,7 +48,6 @@
MistralForTokenClassification,
MistralModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


11 changes: 11 additions & 0 deletions tests/models/nemotron/test_modeling_nemotron.py
@@ -16,6 +16,8 @@

import unittest

from parameterized import parameterized

from transformers import NemotronConfig, is_torch_available
from transformers.testing_utils import (
Expectations,
@@ -96,6 +98,15 @@ def setUp(self):
def test_model_outputs_equivalence(self, **kwargs):
pass

@unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch_accelerator
class NemotronIntegrationTest(unittest.TestCase):
2 changes: 0 additions & 2 deletions tests/models/phi/test_modeling_phi.py
@@ -36,7 +36,6 @@
PhiForTokenClassification,
PhiModel,
)
from transformers.models.phi.modeling_phi import PhiRotaryEmbedding


class PhiModelTester(CausalLMModelTester):
Expand Down Expand Up @@ -69,7 +68,6 @@ class PhiModelTest(CausalLMModelTest, unittest.TestCase):
test_headmasking = False
test_pruning = False
model_tester_class = PhiModelTester
rotary_embedding_layer = PhiRotaryEmbedding

# TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905
def is_pipeline_test_to_skip(
2 changes: 0 additions & 2 deletions tests/models/phi3/test_modeling_phi3.py
@@ -40,7 +40,6 @@
Phi3ForTokenClassification,
Phi3Model,
)
from transformers.models.phi3.modeling_phi3 import Phi3RotaryEmbedding

end_of_text_token = 32000

Expand Down Expand Up @@ -116,7 +115,6 @@ class Phi3ModelTest(CausalLMModelTest, unittest.TestCase):
test_headmasking = False
test_pruning = False
model_tester_class = Phi3ModelTester
rotary_embedding_layer = Phi3RotaryEmbedding


@slow