
Commit 608733c

Author: Guang Yang

fix generate by extracting seq_len from the method meta

1 parent 49d4b1a · commit 608733c

2 files changed: +25, -40 lines

optimum/executorch/modeling.py

Lines changed: 13 additions & 13 deletions
@@ -24,7 +24,6 @@
 import torch
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
-from packaging.version import parse
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForImageClassification,
@@ -37,7 +36,6 @@
 )
 from transformers.utils import is_offline_mode
 
-from executorch import version as executorch_version
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule, _load_for_executorch
 from executorch.kernels import quantized  # noqa
 
@@ -676,10 +674,20 @@ def generate(
         )
         max_seq_len = self.max_cache_size
         generated_tokens = []
+        seq_len = self.model.method_meta("forward").input_tensor_meta(1).sizes()[0]
 
-        if parse(executorch_version.__version__).base_version <= "0.6.0":
-            # TODO: Sequential prefill is preserved for backwards compatibility in order to run PTE generated w/o dynamic shapes.
-            # We can remove this block once the executorch runtime supports `cache_position`.
+        if seq_len > 1:
+            # The model is exported with dynamic shapes. Can support parallel prefill.
+            self.stats.on_sampling_begin()
+            logits = self.forward(
+                input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
+                cache_position=torch.arange(len(prompt_tokens), dtype=torch.long, device=self.device),
+            )
+            self.stats.on_sampling_end()
+            next_token = torch.argmax(logits, dim=-1)[0, -1].item()
+        else:
+            # Sequential prefill is preserved for backwards compatibility in order to run PTE generated w/o dynamic shapes.
+            # TODO: We can remove this block once the executorch runtime supports `cache_position`.
             for i, prompt_token in enumerate(prompt_tokens):
                 self.stats.on_sampling_begin()
                 logits = self.forward(
@@ -688,14 +696,6 @@ def generate(
                 )
                 self.stats.on_sampling_end()
                 next_token = torch.argmax(logits, dim=-1).item()
-        else:
-            self.stats.on_sampling_begin()
-            logits = self.forward(
-                input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
-                cache_position=torch.arange(len(prompt_tokens), dtype=torch.long, device=self.device),
-            )
-            self.stats.on_sampling_end()
-            next_token = torch.argmax(logits, dim=-1)[0, -1].item()
         self.stats.on_prompt_eval_end()
         first_token_generated = False
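
The gist of the change above: rather than branching on the installed executorch version, generate() now asks the exported program itself whether it was built with dynamic shapes, by reading the recorded size of the second input of its "forward" method. Below is a minimal, standalone sketch of that check; the model.pte path is a hypothetical placeholder, and treating input tensor 1 as the one whose first dimension is the prefill sequence length mirrors the call chain in the diff rather than a documented guarantee.

from executorch.extension.pybindings.portable_lib import _load_for_executorch

# Hypothetical path to an exported ExecuTorch program (.pte).
module = _load_for_executorch("model.pte")

# Same call chain as in the diff: read the static size recorded for the
# second input tensor of the "forward" method and take its first dimension.
seq_len = module.method_meta("forward").input_tensor_meta(1).sizes()[0]

if seq_len > 1:
    # Exported with dynamic shapes: the whole prompt can be prefilled in one
    # forward call (parallel prefill), passing a cache_position range.
    print(f"parallel prefill, seq_len = {seq_len}")
else:
    # Exported with a fixed length of 1: fall back to feeding prompt tokens
    # one at a time (sequential prefill).
    print("sequential prefill")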

tests/models/test_modeling_phi4.py

Lines changed: 12 additions & 27 deletions
@@ -16,14 +16,10 @@
 import gc
 import logging
 import os
-import sys
 import unittest
 
 import pytest
-import torchao
-import transformers
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
-from packaging.version import parse
 from transformers import AutoConfig, AutoTokenizer
 from transformers.testing_utils import slow
 
@@ -33,8 +29,8 @@
 
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 is_ci = os.environ.get("GITHUB_ACTIONS") == "true"
-is_linux_ci = sys.platform.startswith("linux") and os.environ.get("GITHUB_ACTIONS") == "true"
 
 
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
@@ -44,47 +40,36 @@ def __init__(self, *args, **kwargs):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(
-        is_linux_ci
-        or parse(transformers.__version__) < parse("4.52.0")
-        or parse(torchao.__version__) < parse("0.11.0"),
-        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0. OOM on linux runner.",
+        is_ci,
+        reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM",
     )
-    def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
+    def test_phi4_text_generation(self):
         model_id = "microsoft/Phi-4-mini-instruct"
         config = AutoConfig.from_pretrained(model_id)
         # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
         # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
         # that function to avoid the data-dependent control flow.
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             config.rope_scaling["type"] = "default"
-        model = ExecuTorchModelForCausalLM.from_pretrained(
-            model_id,
-            recipe="xnnpack",
-            config=config,
-            attn_implementation="custom_sdpa",
-            use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
-        )
+        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack", config=config)
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)
 
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt="My favourite condiment is ",
-            max_seq_len=64,
+            max_seq_len=32,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
 
-        if not is_ci:
-            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
-
-            # Free memory before loading eager for quality check
-            del model
-            del tokenizer
-            gc.collect()
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
 
-            self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
 
     @slow
     @pytest.mark.run_slow
