
Commit d0e3764

Enable prefill for running CausalLM using ET runtime

1 parent 6dc9aa2
3 files changed: +34 -13 lines changed


optimum/executorch/modeling.py

Lines changed: 7 additions & 8 deletions

@@ -623,6 +623,7 @@ def forward(
             torch.Tensor: Logits output from the model.
         """
         self.stats.on_model_execution_start()
+        print(f"DEBUG: {self.model.method_meta('forward')}")
         logits = self.model.forward((input_ids, cache_position))[0]
         self.stats.on_model_execution_end()
         return logits
@@ -667,14 +668,12 @@ def generate(
         max_seq_len = self.max_cache_size
         generated_tokens = []

-        # prefill
-        for i, prompt_token in enumerate(prompt_tokens):
-            self.stats.on_sampling_begin()
-            logits = self.forward(
-                input_ids=torch.tensor([prompt_token], dtype=torch.long, device=self.device).unsqueeze(0),
-                cache_position=torch.tensor([i], dtype=torch.long, device=self.device),
-            )
-            self.stats.on_sampling_end()
+        self.stats.on_sampling_begin()
+        logits = self.forward(
+            input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=self.device).unsqueeze(0),
+            cache_position=torch.tensor([0], dtype=torch.long, device=self.device),
+        )
+        self.stats.on_sampling_end()

         self.stats.on_prompt_eval_end()
         first_token_generated = False
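
For context, the change above replaces per-token prefill with a single batched forward over the whole prompt. A minimal sketch of the before/after behavior (illustrative only; model_forward stands in for the module's self.forward, and the stats hooks are omitted):

    import torch

    def prefill_per_token(model_forward, prompt_tokens, device="cpu"):
        # Old behavior: one forward call per prompt token, writing one
        # KV-cache slot at position i per call.
        logits = None
        for i, token in enumerate(prompt_tokens):
            logits = model_forward(
                input_ids=torch.tensor([token], dtype=torch.long, device=device).unsqueeze(0),
                cache_position=torch.tensor([i], dtype=torch.long, device=device),
            )
        return logits  # logits for the last prompt token

    def prefill_batched(model_forward, prompt_tokens, device="cpu"):
        # New behavior: one forward call over the full prompt, starting at
        # cache position 0, so the runtime can fill all prompt positions in
        # a single execution.
        return model_forward(
            input_ids=torch.tensor(prompt_tokens, dtype=torch.long, device=device).unsqueeze(0),
            cache_position=torch.tensor([0], dtype=torch.long, device=device),
        )

The batched variant requires the exported program to accept a variable-length sequence dimension, which is what the two changes below set up.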

optimum/exporters/executorch/integrations.py

Lines changed: 14 additions & 4 deletions

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Dict
+from typing import Dict, Optional

 import torch
 from torch.export import ExportedProgram
@@ -43,7 +43,13 @@ def __init__(self, model):
         self.config = model.config
         self.metadata = save_config_to_constant_methods(model.config, model.generation_config)

-    def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgram]:
+    def export(
+        self,
+        input_ids=None,
+        cache_position=None,
+        dynamic_shapes: Optional[dict] = None,
+        strict: Optional[bool] = None,
+    ) -> Dict[str, ExportedProgram]:
         example_input_ids = input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long)
         example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)

@@ -57,13 +63,17 @@ def export(self, input_ids=None, cache_position=None) -> Dict[str, ExportedProgram]:
             exportable_module = TorchExportableModuleForDecoderOnlyLM(self.model, max_batch_size, max_cache_len)

             with torch.no_grad():
-                exported_program = exportable_module.export(example_input_ids, example_cache_position)
+                exported_program = exportable_module.export(
+                    example_input_ids, example_cache_position, dynamic_shapes, strict
+                )
         else:
             from transformers.integrations.executorch import (
                 convert_and_export_with_cache,
             )

-            exported_program = convert_and_export_with_cache(self.model, example_input_ids, example_cache_position)
+            exported_program = convert_and_export_with_cache(
+                self.model, example_input_ids, example_cache_position, dynamic_shapes, strict
+            )

         return {"model": exported_program}
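
The new dynamic_shapes argument follows the torch.export convention: a per-input spec mapping dimension indices to dynamic-dim markers, with None meaning fully static. A standalone sketch of that convention on a toy module (illustrative only, not this repo's code):

    import torch

    class Toy(torch.nn.Module):
        def forward(self, input_ids, cache_position):
            return input_ids.float().mean(dim=1) + cache_position.float()

    example_input_ids = torch.zeros((1, 7), dtype=torch.long)
    example_cache_position = torch.tensor([0], dtype=torch.long)

    # Dim 1 of input_ids (the sequence length) is dynamic; cache_position
    # keeps a static shape.
    dynamic_shapes = {"input_ids": {1: torch.export.Dim.DYNAMIC}, "cache_position": None}

    exported = torch.export.export(
        Toy(),
        (example_input_ids, example_cache_position),
        dynamic_shapes=dynamic_shapes,
    )
    print(exported)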

optimum/exporters/executorch/recipes/xnnpack.py

Lines changed: 13 additions & 1 deletion

@@ -15,6 +15,7 @@
 import logging
 from typing import Dict, Union

+import torch
 from packaging.version import parse
 from tabulate import tabulate
 from torch.export import ExportedProgram
@@ -95,7 +96,18 @@ def _lower_to_executorch(
            )
        return et_progs

-    exported_progs = model.export()
+    # Make the sequence length dim dynamic in order to leverage parallel prefill in the ExecuTorch runtime.
+    seq_length = 7
+    input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    cache_position = torch.tensor([0], dtype=torch.long)
+    dynamic_shapes = {"input_ids": {1: torch.export.Dim.DYNAMIC}, "cache_position": None}
+    strict = parse(torch.__version__) != parse("2.7.0")  # Due to bug https://github.com/pytorch/pytorch/issues/150994
+    exported_progs = model.export(
+        input_ids=input_ids,
+        cache_position=cache_position,
+        dynamic_shapes=dynamic_shapes,
+        strict=strict,
+    )

     if model.config._attn_implementation == "custom_sdpa":
         # Sanity check to make sure the exported program contains the custom sdpa operator.
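
Because the sequence dimension is exported as dynamic, the same program can be re-run with prompts of different lengths, which is what enables the single-call prefill in modeling.py. A self-contained sketch of both the version-gated strict flag and the multi-length behavior (illustrative only; it reuses the toy module from the previous sketch rather than the real CausalLM export):

    import torch
    from packaging.version import parse

    class Toy(torch.nn.Module):
        def forward(self, input_ids, cache_position):
            return input_ids.float().sum(dim=1, keepdim=True) + cache_position.float()

    # Non-strict export only on torch 2.7.0, where strict mode hits
    # https://github.com/pytorch/pytorch/issues/150994.
    strict = parse(torch.__version__) != parse("2.7.0")

    exported = torch.export.export(
        Toy(),
        (torch.zeros((1, 7), dtype=torch.long), torch.tensor([0], dtype=torch.long)),
        dynamic_shapes={"input_ids": {1: torch.export.Dim.DYNAMIC}, "cache_position": None},
        strict=strict,
    )

    # The exported module now accepts varying sequence lengths.
    for seq_len in (2, 8, 32):
        out = exported.module()(torch.zeros((1, seq_len), dtype=torch.long), torch.tensor([0], dtype=torch.long))
        print(seq_len, tuple(out.shape))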
