import torch
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
-from packaging.version import parse
from transformers import (
    AutoModelForCausalLM,
    AutoModelForImageClassification,
)
from transformers.utils import is_offline_mode

-from executorch import version as executorch_version
from executorch.extension.pybindings.portable_lib import ExecuTorchModule, _load_for_executorch
from executorch.kernels import quantized  # noqa

@@ -676,10 +674,20 @@ def generate(
        )
        max_seq_len = self.max_cache_size
        generated_tokens = []
+        seq_len = self.model.method_meta("forward").input_tensor_meta(1).sizes()[0]

-        if parse(executorch_version.__version__).base_version <= "0.6.0":
-            # TODO: Sequential prefill is preserved for backwards compatibility in order to run PTE generated w/o dynamic shapes.
-            # We can remove this block once the executorch runtime supports `cache_position`.
+        if seq_len > 1:
+            # The model is exported with dynamic shapes, so it can support parallel prefill.
+            self.stats.on_sampling_begin()
+            logits = self.forward(
+                input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
+                cache_position=torch.arange(len(prompt_tokens), dtype=torch.long, device=self.device),
+            )
+            self.stats.on_sampling_end()
+            next_token = torch.argmax(logits, dim=-1)[0, -1].item()
+        else:
+            # Sequential prefill is preserved for backwards compatibility in order to run PTEs generated w/o dynamic shapes.
+            # TODO: We can remove this block once the executorch runtime supports `cache_position`.
            for i, prompt_token in enumerate(prompt_tokens):
                self.stats.on_sampling_begin()
                logits = self.forward(
@@ -688,14 +696,6 @@ def generate(
                )
                self.stats.on_sampling_end()
                next_token = torch.argmax(logits, dim=-1).item()
-        else:
-            self.stats.on_sampling_begin()
-            logits = self.forward(
-                input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
-                cache_position=torch.arange(len(prompt_tokens), dtype=torch.long, device=self.device),
-            )
-            self.stats.on_sampling_end()
-            next_token = torch.argmax(logits, dim=-1)[0, -1].item()
        self.stats.on_prompt_eval_end()
        first_token_generated = False

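
The new branch picks between parallel and sequential prefill by reading the sequence length baked into the exported "forward" method. Below is a minimal standalone sketch of that same probe, not part of this PR: the path "model.pte" is a placeholder, input index 1 mirrors the lookup used above, and the method_meta/input_tensor_meta accessors are assumed to behave as they do in the diff.

# Hypothetical standalone check (illustration only): inspect a .pte to see
# whether its "forward" method was exported with a prompt dimension > 1,
# which is the same signal the new `seq_len > 1` branch keys off.
from executorch.extension.pybindings.portable_lib import _load_for_executorch

module = _load_for_executorch("model.pte")  # placeholder path
seq_len = module.method_meta("forward").input_tensor_meta(1).sizes()[0]

if seq_len > 1:
    # Dynamic-shape export: the whole prompt can be prefilled in one forward call.
    print(f"seq_len={seq_len}: parallel prefill supported")
else:
    # Static export (seq_len == 1): prefill must feed one token at a time.
    print("seq_len=1: falling back to sequential prefill")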