65 changes: 50 additions & 15 deletions tests/causal_lm_tester.py
@@ -18,7 +18,7 @@
import pytest
from parameterized import parameterized

from transformers import set_seed
from transformers import PretrainedConfig, set_seed
from transformers.testing_utils import (
is_flaky,
require_flash_attn,
@@ -230,7 +230,6 @@ class CausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
test_pruning = False
model_tester_class = None
all_model_classes = None
rotary_embedding_layer = None # Enables RoPE tests if set
pipeline_model_mapping = None

def setUp(self):
@@ -319,21 +318,28 @@ def test_token_classification_model(self):

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
def test_model_rope_scaling_from_config(self, scaling_type):
if self.rotary_embedding_layer is None:
self.skipTest("Rotary embedding layer not set")
"""
Tests that we can initialize a model with RoPE scaling in the config, that it can run a forward pass, and
that a few basic model output properties are honored.
"""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

if not _config_supports_rope_scaling(config):
self.skipTest("This model does not support RoPE scaling")

short_input = ids_tensor([1, 10], config.vocab_size)
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)

set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"rope_type": "default"}
original_model = self.model_tester_class.base_model_class(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
original_long_output = original_model(long_input).last_hidden_state

set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
config.rope_scaling = {"rope_type": scaling_type, "factor": 10.0}
scaled_model = self.model_tester_class.base_model_class(config)
scaled_model.to(torch_device)
scaled_model.eval()
@@ -350,10 +356,26 @@ def test_model_rope_scaling_from_config(self, scaling_type):
# The output should be different for long inputs
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))

def test_model_rope_scaling(self):
if self.rotary_embedding_layer is None:
self.skipTest("Rotary embedding layer not set")
def test_model_rope_scaling_frequencies(self):
"""Tests the frequency properties of the different RoPE scaling types on the model RoPE layer."""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

if not _config_supports_rope_scaling(config):
self.skipTest("This model does not support RoPE scaling")

# Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the
# named location of the RoPE layer class.
base_model = self.model_tester.base_model_class(config)
Member:
We might need `model.get_decoder()` as well for models where the LM backbone is hidden inside the base model, though I guess these tests aren't yet used in multimodal models.

Member (Author):
Good point. Given that the tests are only run on decoder-only models for now, I'd rather leave it as is (and upgrade when it's needed) 🤗
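A minimal sketch of the reviewer's suggestion (illustrative only, not part of this PR; `base_model` is assumed to be an instantiated model, and `get_decoder()` is assumed to return the nested text backbone when one exists):

# Hypothetical fallback: search the nested decoder when the text backbone is wrapped
# (e.g. multimodal models); otherwise search the base model itself, as the code below does.
search_root = base_model.get_decoder() if hasattr(base_model, "get_decoder") else base_model
possible_rope_attributes = ["rotary_emb", "global_rotary_emb", "local_rotary_emb"]
rope_class = None
for name, module in search_root.named_modules():
    if any(potential_name in name for potential_name in possible_rope_attributes):
        rope_class = type(module)
        break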

possible_rope_attributes = [
"rotary_emb", # most common case
"global_rotary_emb",
"local_rotary_emb",
]
for name, module in base_model.named_modules():
if any(potential_name in name for potential_name in possible_rope_attributes):
rope_class = type(module)
break

scaling_factor = 10
short_input_length = 10
long_input_length = int(config.max_position_embeddings * 1.5)
@@ -368,16 +390,17 @@ def test_model_rope_scaling(self):
position_ids_long = position_ids_long.unsqueeze(0)

# Sanity check original RoPE
original_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "default"}
original_rope = rope_class(config=config).to(torch_device)
original_cos_short, original_sin_short = original_rope(x, position_ids_short)
original_cos_long, original_sin_long = original_rope(x, position_ids_long)
torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :])
torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :])

# Sanity check linear RoPE scaling
# New position "x" should match original position with index "x/scaling_factor"
config.rope_scaling = {"type": "linear", "factor": scaling_factor}
linear_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor}
linear_scaling_rope = rope_class(config=config).to(torch_device)
linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short)
linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long)
torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :])
@@ -390,8 +413,8 @@ def test_model_rope_scaling(self):
# Sanity check Dynamic NTK RoPE scaling
# Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
# with scaling_factor (or that `inv_freq` decreases)
config.rope_scaling = {"type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = rope_class(config=config).to(torch_device)
ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short)
ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long)
torch.testing.assert_close(ntk_cos_short, original_cos_short)
@@ -404,8 +427,8 @@ def test_model_rope_scaling(self):

# Sanity check Yarn RoPE scaling
# Scaling should be over the entire input
config.rope_scaling = {"type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = self.rotary_embedding_layer(config=config).to(torch_device)
config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = rope_class(config=config).to(torch_device)
yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short)
yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long)
torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :])
@@ -450,3 +473,15 @@ def test_flash_attn_2_equivalence(self):
logits = outputs.hidden_states[-1]
logits_fa = outputs_fa.hidden_states[-1]
torch.testing.assert_close(logits_fa, logits, atol=3e-2, rtol=3e-2)


def _config_supports_rope_scaling(config: PretrainedConfig) -> bool:
"""Returns whether a certain model config supports RoPE scaling parameterization."""
# Has rope_scaling -> model was designed with rope scaling in mind
# Has rope_theta (and no rope_scaling) -> probably an older model, but should support rope scaling as well
main_config_has_rope = hasattr(config, "rope_scaling") or hasattr(config, "rope_theta")
sub_config_has_rope = any(
hasattr(config[sub_config], "rope_scaling") or hasattr(config[sub_config], "rope_theta")
for sub_config in config.sub_configs.keys()
)
return main_config_has_rope or sub_config_has_rope
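A quick illustration of the helper (not part of the diff, and assuming `LlamaConfig` keeps exposing `rope_theta` plus the `sub_configs` mapping used above):

from transformers import LlamaConfig

# A decoder-only config with `rope_theta` passes the check, so the RoPE scaling tests
# above run for it rather than being skipped.
assert _config_supports_rope_scaling(LlamaConfig())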
2 changes: 0 additions & 2 deletions tests/models/arcee/test_modeling_arcee.py
@@ -39,7 +39,6 @@
ArceeForTokenClassification,
ArceeModel,
)
from transformers.models.arcee.modeling_arcee import ArceeRotaryEmbedding


class ArceeModelTester(CausalLMModelTester):
@@ -80,7 +79,6 @@ class ArceeModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False
model_tester_class = ArceeModelTester
rotary_embedding_layer = ArceeRotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
11 changes: 11 additions & 0 deletions tests/models/dbrx/test_modeling_dbrx.py
@@ -15,6 +15,8 @@

import unittest

from parameterized import parameterized

from transformers import DbrxConfig, is_torch_available
from transformers.testing_utils import require_torch, slow

@@ -122,6 +124,15 @@ def test_disk_offload_safetensors(self):
def test_disk_offload_bin(self):
pass

@unittest.skip("Dbrx doesn't have RoPE scaling implemented")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("Dbrx doesn't have RoPE scaling implemented")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class DbrxModelIntegrationTest(unittest.TestCase):
13 changes: 8 additions & 5 deletions tests/models/deepseek_v2/test_modeling_deepseek_v2.py
@@ -82,13 +82,16 @@ class DeepseekV2ModelTest(CausalLMModelTest, unittest.TestCase):
test_torchscript = False
test_all_params_have_gradient = False
model_tester_class = DeepseekV2ModelTester
rotary_embedding_layer = DeepseekV2RotaryEmbedding
model_split_percents = [0.5, 0.7, 0.8]

# used in `test_torch_compile_for_training`
_torch_compile_train_cls = DeepseekV2ForCausalLM if is_torch_available() else None

def test_model_rope_scaling(self):
def test_model_rope_scaling_frequencies(self):
"""
Overwritten: DeepseekV2 implements RoPE in the complex domain, as opposed to in the real domain with
`sin` and `cos`. Nevertheless, the checks are the same as in the original test.
"""
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
scaling_factor = 10
short_input_length = 10
@@ -109,7 +112,7 @@ def test_model_rope_scaling(self):

# Sanity check linear RoPE scaling
# New position "x" should match original position with index "x/scaling_factor"
config.rope_scaling = {"type": "linear", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor}
linear_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
linear_freqs_cis_short = linear_scaling_rope(x, position_ids_short)
linear_freqs_cis_long = linear_scaling_rope(x, position_ids_long)
@@ -118,7 +121,7 @@ def test_model_rope_scaling(self):
# Sanity check Dynamic NTK RoPE scaling
# Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
# with scaling_factor (or that `inv_freq` decreases)
config.rope_scaling = {"type": "dynamic", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor}
ntk_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
ntk_freqs_cis_short = ntk_scaling_rope(x, position_ids_short)
ntk_freqs_cis_long = ntk_scaling_rope(x, position_ids_long)
@@ -129,7 +132,7 @@ def test_model_rope_scaling(self):

# Sanity check Yarn RoPE scaling
# Scaling should be over the entire input
config.rope_scaling = {"type": "yarn", "factor": scaling_factor}
config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor}
yarn_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device)
yarn_freqs_cis_short = yarn_scaling_rope(x, position_ids_short)
yarn_freqs_cis_long = yarn_scaling_rope(x, position_ids_long)
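As a side note to the docstring above, a minimal sketch of why the complex-domain and `sin`/`cos` parameterizations carry the same information (illustrative only, standard RoPE math rather than the exact DeepseekV2 code):

import torch

# e^{i*theta} packs cos(theta) and sin(theta) into one complex number, so frequency
# checks written against `freqs_cis` mirror the real-domain `cos`/`sin` checks.
theta = torch.rand(10)
freqs_cis = torch.polar(torch.ones_like(theta), theta)
assert torch.allclose(freqs_cis.real, torch.cos(theta))
assert torch.allclose(freqs_cis.imag, torch.sin(theta))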
2 changes: 0 additions & 2 deletions tests/models/ernie4_5/test_modeling_ernie4_5.py
@@ -37,7 +37,6 @@
Ernie4_5ForCausalLM,
Ernie4_5Model,
)
from transformers.models.ernie4_5.modeling_ernie4_5 import Ernie4_5RotaryEmbedding


class Ernie4_5ModelTester(CausalLMModelTester):
@@ -69,7 +68,6 @@ class Ernie4_5ModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False # Broken by attention refactor cc @Cyrilvallez
model_tester_class = Ernie4_5ModelTester
rotary_embedding_layer = Ernie4_5RotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
1 change: 1 addition & 0 deletions tests/models/ernie4_5_moe/test_modeling_ernie4_5_moe.py
@@ -41,6 +41,7 @@
Ernie4_5_MoeForCausalLM,
Ernie4_5_MoeModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


12 changes: 11 additions & 1 deletion tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py
@@ -15,6 +15,8 @@

import unittest

from parameterized import parameterized

from transformers import HunYuanDenseV1Config, is_torch_available
from transformers.testing_utils import (
cleanup,
@@ -30,7 +32,6 @@
HunYuanDenseV1ForSequenceClassification,
HunYuanDenseV1Model,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


@@ -78,6 +79,15 @@ def is_pipeline_test_to_skip(
):
return True

@unittest.skip("HunYuanDenseV1's RoPE has custom parameterization")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("HunYuanDenseV1's RoPE has custom parameterization")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class HunYuanDenseV1IntegrationTest(unittest.TestCase):
10 changes: 10 additions & 0 deletions tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py
@@ -16,6 +16,7 @@
import unittest

import pytest
from parameterized import parameterized

from transformers import HunYuanMoEV1Config, is_torch_available
from transformers.testing_utils import (
@@ -101,6 +102,15 @@ def test_generate_from_inputs_embeds_with_static_cache(self):
def test_generate_with_static_cache(self):
pass

@unittest.skip("HunYuanMoEV1's RoPE has custom parameterization")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("HunYuanMoEV1's RoPE has custom parameterization")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch
class HunYuanMoEV1IntegrationTest(unittest.TestCase):
2 changes: 0 additions & 2 deletions tests/models/llama/test_modeling_llama.py
@@ -46,7 +46,6 @@
LlamaModel,
LlamaTokenizer,
)
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding


class LlamaModelTester(CausalLMModelTester):
@@ -87,7 +86,6 @@ class LlamaModelTest(CausalLMModelTest, unittest.TestCase):
test_pruning = False
fx_compatible = False # Broken by attention refactor cc @Cyrilvallez
model_tester_class = LlamaModelTester
rotary_embedding_layer = LlamaRotaryEmbedding # Enables RoPE tests if set

# Need to use `0.8` instead of `0.9` for `test_cpu_offload`
# This is because we are hitting edge cases with the causal_mask buffer
1 change: 0 additions & 1 deletion tests/models/minimax/test_modeling_minimax.py
@@ -38,7 +38,6 @@
MiniMaxForTokenClassification,
MiniMaxModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


1 change: 0 additions & 1 deletion tests/models/mistral/test_modeling_mistral.py
@@ -48,7 +48,6 @@
MistralForTokenClassification,
MistralModel,
)

from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester


11 changes: 11 additions & 0 deletions tests/models/nemotron/test_modeling_nemotron.py
@@ -16,6 +16,8 @@

import unittest

from parameterized import parameterized

from transformers import NemotronConfig, is_torch_available
from transformers.testing_utils import (
Expectations,
@@ -96,6 +98,15 @@ def setUp(self):
def test_model_outputs_equivalence(self, **kwargs):
pass

@unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling")
def test_model_rope_scaling_frequencies(self):
pass

@parameterized.expand([("linear",), ("dynamic",), ("yarn",)])
@unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling")
def test_model_rope_scaling_from_config(self, scaling_type):
pass


@require_torch_accelerator
class NemotronIntegrationTest(unittest.TestCase):
2 changes: 0 additions & 2 deletions tests/models/phi/test_modeling_phi.py
@@ -36,7 +36,6 @@
PhiForTokenClassification,
PhiModel,
)
from transformers.models.phi.modeling_phi import PhiRotaryEmbedding


class PhiModelTester(CausalLMModelTester):
Expand Down Expand Up @@ -69,7 +68,6 @@ class PhiModelTest(CausalLMModelTest, unittest.TestCase):
test_headmasking = False
test_pruning = False
model_tester_class = PhiModelTester
rotary_embedding_layer = PhiRotaryEmbedding

# TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905
def is_pipeline_test_to_skip(
2 changes: 0 additions & 2 deletions tests/models/phi3/test_modeling_phi3.py
@@ -40,7 +40,6 @@
Phi3ForTokenClassification,
Phi3Model,
)
from transformers.models.phi3.modeling_phi3 import Phi3RotaryEmbedding

end_of_text_token = 32000

Expand Down Expand Up @@ -116,7 +115,6 @@ class Phi3ModelTest(CausalLMModelTest, unittest.TestCase):
test_headmasking = False
test_pruning = False
model_tester_class = Phi3ModelTester
rotary_embedding_layer = Phi3RotaryEmbedding


@slow