 from typing import Optional, Tuple, Union

 import torch
-from executorch import version as executorch_version
-from packaging import version as pkg_version
+from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa


-if pkg_version.parse(executorch_version.__version__) >= pkg_version.parse("0.6.0"):
-    from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa
+def custom_sdpa_with_start_pos_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
+    scaling: Optional[float] = None,
+    softcap: Optional[float] = None,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # This is before the transpose
+    max_seq_len = key.shape[2]

-    def custom_sdpa_with_start_pos_forward(
-        module: torch.nn.Module,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
-        scaling: Optional[float] = None,
-        softcap: Optional[float] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, None]:
-        # This is before the transpose
-        max_seq_len = key.shape[2]
+    # FA2 uses non-transposed inputs
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)

-        # FA2 uses non-transposed inputs
-        query = query.transpose(1, 2)
-        key = key.transpose(1, 2)
-        value = value.transpose(1, 2)
+    # Convert the hell out of the inputs to fp32 and back
+    input_dtype = query.dtype
+    query = query.to(torch.float32)
+    key = key.to(torch.float32)
+    value = value.to(torch.float32)

-        # Convert the hell out of the inputs to fp32 and back
-        input_dtype = query.dtype
-        query = query.to(torch.float32)
-        key = key.to(torch.float32)
-        value = value.to(torch.float32)
+    # Ignore the causal flag from kwargs but use the one in module
+    kwargs.pop("is_causal", None)

-        # Ignore the causal flag from kwargs but use the one in module
-        kwargs.pop("is_causal", None)
-
-        # Calculate the input pos from attention mask.
-        # Branch out for float vs bool mask
-        # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
-        attention_mask = attention_mask.reshape(-1, max_seq_len)
-        first_row_mask = attention_mask[0, :]
-        # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
-        start_pos = torch.argmin(first_row_mask).item() - 1
-        output = torch.ops.llama.custom_sdpa(
-            query,
-            key,
-            value,
-            start_pos=start_pos,
-            attn_mask=None,
-            drpout_p=0.0,
-            is_causal=module.is_causal,
-            scale=scaling,
-        )
-        return output.to(input_dtype), None
+    # Calculate the input pos from attention mask.
+    # Branch out for float vs bool mask
+    # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
+    attention_mask = attention_mask.reshape(-1, max_seq_len)
+    first_row_mask = attention_mask[0, :]
+    # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
+    start_pos = torch.argmin(first_row_mask).item() - 1
+    output = torch.ops.llama.custom_sdpa(
+        query,
+        key,
+        value,
+        start_pos=start_pos,
+        attn_mask=None,
+        drpout_p=0.0,
+        is_causal=module.is_causal,
+        scale=scaling,
+    )
+    return output.to(input_dtype), None
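For readers skimming the hunk: the new top-level forward infers start_pos from a float causal mask, where attendable positions hold 0.0 and masked positions hold -inf, so torch.argmin picks out the first masked column and the index just before it is the last valid position. A minimal, self-contained sketch of that derivation with made-up values (illustrative only, not part of the patch):

import torch

# Float causal mask row for max_seq_len = 8 with four attendable positions.
# argmin() returns the index of the first -inf (4), so start_pos = 4 - 1 = 3,
# matching the "[0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3" comment.
max_seq_len = 8
first_row_mask = torch.tensor([0.0, 0.0, 0.0, 0.0] + [float("-inf")] * 4)
start_pos = torch.argmin(first_row_mask).item() - 1
assert start_pos == 3

As the "Branch out for float vs bool mask" comment notes, this only works for an additive float mask; a boolean mask would need a separate branch.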
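The hunk does not show how this forward gets attached to a model. A hypothetical hookup, assuming the Hugging Face Transformers AttentionInterface registry (the signature above matches that calling convention); the registry name used here is made up:

# Hypothetical wiring, not from this patch.
from transformers import AttentionInterface

AttentionInterface.register("custom_sdpa_with_start_pos", custom_sdpa_with_start_pos_forward)
# A model loaded with attn_implementation="custom_sdpa_with_start_pos" would then
# route its attention calls through the ExecuTorch torch.ops.llama.custom_sdpa op.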