@@ -491,11 +491,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_generate_without_input_ids(self):
         pass
 
-    @pytest.mark.generate
-    @unittest.skip(reason="""IDEFICS cannot generate with no images provided!""")
-    def test_generate_continue_from_inputs_embeds(self):
-        pass
-
     @pytest.mark.generate
     @unittest.skip(reason="""IDEFICS cannot do contrastive generation yet and it is not worth fixing""")
     def test_contrastive_generate(self):
@@ -776,65 +771,6 @@ def test_generate_without_input_ids(self):
         )
         self.assertIsNotNone(output_ids_generate)
 
-    @pytest.mark.generate
-    def test_generate_continue_from_inputs_embeds(self):
-        """Overwrite for IDEFICS: Ensure image attention mask is processed while continuing from `inputs_embeds`."""
-
-        for model_class in self.all_generative_model_classes:
-            config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
-            print(inputs)
-
-            model = model_class(config).to(torch_device).eval()
-
-            model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1
-            model.generation_config.forced_eos_token_id = None
-            model.generation_config.use_cache = True
-
-            input_ids = inputs.pop("input_ids")
-            input_embeds = model.get_input_embeddings()(input_ids)
-
-            generation_kwargs = {
-                "return_dict_in_generate": True,
-                "do_sample": False,
-            }
-
-            inputs["inputs_embeds"] = input_embeds
-
-            # Traditional way of generating text, with `return_dict_in_generate` to return the past key values
-            outputs = model.generate(**inputs, max_new_tokens=4, **generation_kwargs)
-            # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the
-            # inputs may need to be tweaked across `generate` calls (like the attention mask).
-            initial_output = model.generate(**inputs, max_new_tokens=3, **generation_kwargs)
-            inputs["past_key_values"] = initial_output.past_key_values
-
-            new_attention_len = input_ids.shape[1] + initial_output.sequences.shape[-1]
-            continued_embeds = torch.cat([input_embeds, model.get_input_embeddings()(initial_output.sequences)], dim=1)
-            inputs["inputs_embeds"] = continued_embeds
-
-            if "attention_mask" in inputs:
-                inputs["attention_mask"] = torch.nn.functional.pad(
-                    inputs["attention_mask"],
-                    (0, new_attention_len - inputs["attention_mask"].shape[1]),
-                    mode="constant",
-                    value=1,
-                )
-            if "image_attention_mask" in inputs:
-                inputs["image_attention_mask"] = inputs["image_attention_mask"][..., -1:, :]
-
-            cached_output = model.generate(**inputs, max_new_tokens=1, **generation_kwargs)
-
-            # Verify that the combined outputs match the full generation.
-            combined_output_sequences = torch.concat([initial_output.sequences, cached_output.sequences], axis=1)
-            self.assertListEqual(outputs.sequences.tolist(), combined_output_sequences.tolist())
-            for layer_idx in range(len(cached_output.past_key_values)):
-                for kv_idx in range(len(cached_output.past_key_values[layer_idx])):
-                    self.assertTrue(
-                        torch.allclose(
-                            outputs.past_key_values[layer_idx][kv_idx],
-                            cached_output.past_key_values[layer_idx][kv_idx],
-                        )
-                    )
-
     def _check_attentions_for_generate(
         self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
     ):