
Commit d146ba7

Merge branch 'main' into sam-vision-encoder
2 parents a1652c6 + cbfb8d7 commit d146ba7

16 files changed: +260 additions, -222 deletions

README.md

Lines changed: 169 additions & 189 deletions
Large diffs are not rendered by default.

docs/source/en/installation.md

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
 
 # Installation
 
-Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, and Flax.
+Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.0+, TensorFlow 2.6+, and Flax 0.4.1+.
 
 ## Virtual environment
 
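For a quick sanity check against the updated minimums, something like the following can be run in the target environment (a minimal sketch; it only prints versions and assumes the PyTorch backend is installed):

```python
# Minimal environment check against the documented minimums
# (Python 3.9+, PyTorch 2.0+); assumes PyTorch is the installed backend.
import sys

import torch
import transformers

print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)
```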

docs/source/en/model_doc/hubert.md

Lines changed: 3 additions & 2 deletions
@@ -71,9 +71,10 @@ pip install -U flash-attn --no-build-isolation
 Below is an expected speedup diagram comparing the pure inference time between the native implementation in transformers of `facebook/hubert-large-ls960-ft`, the flash-attention-2 and the sdpa (scale-dot-product-attention) version. We show the average speedup obtained on the `librispeech_asr` `clean` validation split:
 
 ```python
->>> from transformers import Wav2Vec2Model
+>>> from transformers import HubertModel
+>>> import torch
 
-model = Wav2Vec2Model.from_pretrained("facebook/hubert-large-ls960-ft", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
+>>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")
 ...
 ```
 
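For comparison with the SDPA variant mentioned in that paragraph, loading the same checkpoint would look roughly like this (a sketch, not part of the diff; it assumes a CUDA device is available):

```python
# Sketch: loading the same checkpoint with the SDPA attention implementation
# mentioned in the benchmark description; assumes a CUDA device is available.
import torch
from transformers import HubertModel

model = HubertModel.from_pretrained(
    "facebook/hubert-large-ls960-ft",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
).to("cuda")
```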

src/transformers/configuration_utils.py

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ class PretrainedConfig(PushToHubMixin):
         is_encoder_decoder (`bool`, *optional*, defaults to `False`):
             Whether the model is used as an encoder/decoder or not.
         is_decoder (`bool`, *optional*, defaults to `False`):
-            Whether the model is used as decoder or not (in which case it's used as an encoder).
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on decoder-only or encoder-only architectures.
         cross_attention_hidden_size** (`bool`, *optional*):
             The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
             setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
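To illustrate the documented flag, here is a hedged example of configuring a BERT-style model as the decoder half of an encoder-decoder setup (checkpoint and values are illustrative only):

```python
# Illustrative only: `is_decoder=True` matters when a bidirectional architecture
# is reused as the decoder of an encoder-decoder model; `add_cross_attention`
# adds the cross-attention layers that attend to the encoder outputs.
from transformers import BertConfig, BertLMHeadModel

decoder_config = BertConfig.from_pretrained(
    "google-bert/bert-base-uncased",
    is_decoder=True,
    add_cross_attention=True,
)
decoder = BertLMHeadModel(decoder_config)
```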

src/transformers/models/blip/modeling_blip.py

Lines changed: 1 addition & 1 deletion
@@ -1233,7 +1233,7 @@ def generate(
     """,
     BLIP_START_DOCSTRING,
 )
-class BlipForQuestionAnswering(BlipPreTrainedModel):
+class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin):
     config_class = BlipConfig
     _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
 
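With `GenerationMixin` now explicitly in the class hierarchy, the usual generate-based VQA flow applies; a rough usage sketch (the image URL and checkpoint are examples, not part of the commit):

```python
# Rough usage sketch for BlipForQuestionAnswering.generate(); the checkpoint
# and image URL are examples, not part of the commit.
import requests
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(images=image, text="how many cats are there?", return_tensors="pt")
output_ids = model.generate(**inputs)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```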

src/transformers/models/speecht5/modeling_speecht5.py

Lines changed: 7 additions & 0 deletions
@@ -2631,6 +2631,13 @@ def __init__(self, config: SpeechT5Config):
         # Initialize weights and apply final processing
         self.post_init()
 
+    @classmethod
+    def can_generate(cls) -> bool:
+        # Speecht5 has a unique model structure, where the external class (`SpeechT5ForTextToSpeech`) doesn't need to inherit from
+        # `GenerationMixin` (it has a non-standard generation method). This means that the base `can_generate()` will return `False`,
+        # but we need to override it so as to do `GenerationConfig` handling in multiple parts of the codebase.
+        return True
+
     def get_encoder(self):
         return self.speecht5.get_encoder()
 
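A hedged spot-check of the new behaviour (the checkpoint name is only an example and is downloaded on first use):

```python
# Spot-check sketch: SpeechT5ForTextToSpeech does not inherit GenerationMixin,
# but after this change can_generate() reports True, so GenerationConfig
# handling still applies. The checkpoint name is an example only.
from transformers import SpeechT5ForTextToSpeech

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
assert model.can_generate()
print(model.generation_config)
```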

src/transformers/trainer_utils.py

Lines changed: 3 additions & 0 deletions
@@ -762,6 +762,9 @@ def has_length(dataset):
     except TypeError:
         # TypeError: len() of unsized object
         return False
+    except AttributeError:
+        # Ray DataSets raises an AttributeError: https://github.com/ray-project/ray/blob/master/python/ray/data/dataset.py#L5616
+        return False
 
 
 def denumpify_detensorize(metrics):
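For context, the helper reads roughly like this after the change (a sketch reconstructed from the hunk; the try/len part is paraphrased rather than copied verbatim from the file):

```python
# Sketch of how `has_length` reads after this change, reconstructed from the
# hunk above; the try/len part is paraphrased, not a verbatim copy.
def has_length(dataset) -> bool:
    """Return True if `dataset` implements a usable __len__()."""
    try:
        return len(dataset) is not None
    except TypeError:
        # TypeError: len() of unsized object
        return False
    except AttributeError:
        # Ray Datasets raises an AttributeError when len() is taken
        return False
```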

src/transformers/training_args.py

Lines changed: 1 addition & 1 deletion
@@ -1641,7 +1641,7 @@ def __post_init__(self):
             self.do_eval = True
 
         if self.torch_empty_cache_steps is not None:
-            if not (isinstance(self.torch_empty_cache_steps, int) or self.torch_empty_cache_steps > 0):
+            if not (isinstance(self.torch_empty_cache_steps, int) and self.torch_empty_cache_steps > 0):
                 raise ValueError(
                     f"`torch_empty_cache_steps` must be an integer bigger than 0, got {self.torch_empty_cache_steps}."
                 )
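The one-character fix flips the check from "either condition" to "both conditions"; a standalone sketch of the corrected logic (the function name is made up for illustration):

```python
# Standalone sketch of the corrected validation: the value must be an int AND
# positive. With the old `or`, 0 or -3 would slip through (they are ints) and
# 2.5 would slip through (it is > 0). The function name is hypothetical.
def validate_torch_empty_cache_steps(value):
    if value is not None:
        if not (isinstance(value, int) and value > 0):
            raise ValueError(
                f"`torch_empty_cache_steps` must be an integer bigger than 0, got {value}."
            )
    return value
```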

src/transformers/utils/quantization_config.py

Lines changed: 3 additions & 0 deletions
@@ -902,6 +902,9 @@ def post_init(self):
             )
 
         if self.backend == AwqBackendPackingMethod.LLMAWQ:
+            # Only cuda device can run this function
+            if not torch.cuda.is_available():
+                raise ValueError("LLM-AWQ backend is only supported on CUDA")
             compute_capability = torch.cuda.get_device_capability()
             major, minor = compute_capability
             if major < 8:
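Outside of the config class, the same guard pattern boils down to the following sketch (the helper name and the second error message are hypothetical):

```python
# Hypothetical standalone helper mirroring the guard added above: LLM-AWQ needs
# a CUDA device, and the surrounding code additionally requires compute
# capability >= 8.0. Helper name and second message are illustrative only.
import torch


def check_llm_awq_hardware() -> None:
    if not torch.cuda.is_available():
        raise ValueError("LLM-AWQ backend is only supported on CUDA")
    major, minor = torch.cuda.get_device_capability()
    if major < 8:
        raise ValueError("LLM-AWQ quantization requires a GPU with compute capability >= 8.0")
```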

tests/models/bark/test_modeling_bark.py

Lines changed: 4 additions & 0 deletions
@@ -1076,6 +1076,10 @@ def fine_generation_config(self):
         fine_generation_config = BarkFineGenerationConfig(**self.model.generation_config.fine_acoustics_config)
         return fine_generation_config
 
+    def test_model_can_generate(self):
+        # Bark has custom generate without inheriting GenerationMixin. This test could prevent regression.
+        self.assertTrue(self.model.can_generate())
+
     @slow
     def test_generate_semantic(self):
         input_ids = self.inputs
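The same spot-check can be reproduced ad hoc outside the test suite (a sketch; the checkpoint name is an example and its weights are downloaded on first use):

```python
# Ad-hoc sketch mirroring the regression test: Bark defines its own generate()
# without inheriting GenerationMixin, and can_generate() should still be True.
# The checkpoint name is an example only.
from transformers import BarkModel

model = BarkModel.from_pretrained("suno/bark-small")
assert model.can_generate()
```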
