Introduce 8da4w quant for decoder-only text models

Guang Yang · Guang Yang · commit c20bd3edf9dd · 2025-05-01T11:24:27.000-07:00
diff --git a/.github/workflows/test_models.yml b/.github/workflows/test_models.yml
@@ -52,11 +52,12 @@ jobs:
       - name: Install dependencies for ExecuTorch
         run: |
           if [ "${{ matrix.executorch-version }}" == "nightly" ]; then
-            export NIGHTLY_VERSION=dev20250413
+            export NIGHTLY_VERSION=dev20250422
             pip install executorch==0.7.0.${NIGHTLY_VERSION} \
               torch==2.8.0.${NIGHTLY_VERSION} \
               torchvision==0.22.0.${NIGHTLY_VERSION} \
               torchaudio==2.6.0.${NIGHTLY_VERSION} \
+              torchao==0.11.0.${NIGHTLY_VERSION} \
               --extra-index-url "https://download.pytorch.org/whl/nightly/cpu"
           else
             pip install executorch==${{ matrix.executorch-version }}
diff --git a/README.md b/README.md
@@ -176,7 +176,7 @@ We currently support a wide range of popular transformer models, including encod
 
 **Supported Backend:**
 
-Currently, **Optimum-ExecuTorch** supports only the [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack.html) for efficient CPU execution on mobile devices. Quantization support for XNNPACK is planned to be added shortly.
+Currently, **Optimum-ExecuTorch** supports only the [XNNPACK Backend](https://pytorch.org/executorch/main/backends-xnnpack.html) for efficient execution on mobile CPUs. Post-Training Quantization (PTQ) is supported for linear layers, using int8 dynamic per-token activations and int4 grouped per-channel weights (`8da4w`).
 
 For a comprehensive overview of all backends supported by ExecuTorch, please refer to the [ExecuTorch Backend Overview](https://pytorch.org/executorch/main/backends-overview.html).
 
diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py
@@ -57,6 +57,13 @@ def parse_args_executorch(parser):
         action="store_true",
         help="For decoder-only models to use custom sdpa with static kv cache to boost performance. Defaults to False.",
     )
+    required_group.add_argument(
+        "-q",
+        "--quantize",
+        required=False,
+        choices=["8da4w"],
+        help="Quantization recipe to use. Defaults to None.",
+    )
 
 
 class ExecuTorchExportCommand(BaseOptimumCLICommand):
@@ -72,6 +79,8 @@ def run(self):
         kwargs = {}
         if self.args.use_custom_sdpa:
             kwargs["use_custom_sdpa"] = self.args.use_custom_sdpa
+        if self.args.quantize:
+            kwargs["quantize"] = self.args.quantize
 
         main_export(
             model_name_or_path=self.args.model,
diff --git a/optimum/exporters/executorch/recipes/xnnpack.py b/optimum/exporters/executorch/recipes/xnnpack.py
@@ -15,9 +15,11 @@
 import logging
 from typing import Dict, Union
 
+from tabulate import tabulate
 from torch.export import ExportedProgram
 
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.devtools.backend_debug import get_delegation_info
 from executorch.exir import (
     EdgeCompileConfig,
     ExecutorchBackendConfig,
@@ -73,7 +75,15 @@ def _lower_to_executorch(
                     extract_delegate_segments=True,
                 ),
             )
-            logging.debug(f"Exported program for {pte_name}.pte: {et_progs[pte_name].exported_program().graph_module}")
+            logging.debug(f"\nExported program for {pte_name}.pte: {exported_program}")
+            logging.debug(
+                f"\nExecuTorch program for {pte_name}.pte: {et_progs[pte_name].exported_program().graph_module}"
+            )
+            delegation_info = get_delegation_info(et_progs[pte_name].exported_program().graph_module)
+            logging.debug(f"\nDelegation info Summary for {pte_name}.pte: {delegation_info.get_summary()}")
+            logging.debug(
+                f"\nDelegation info for {pte_name}.pte: {tabulate(delegation_info.get_operator_delegation_dataframe(), headers='keys', tablefmt='fancy_grid')}"
+            )
         return et_progs
 
     exported_progs = model.export()
diff --git a/optimum/exporters/executorch/tasks/causal_lm.py b/optimum/exporters/executorch/tasks/causal_lm.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import torch
+import torchao
+from packaging.version import parse
 from transformers import AutoModelForCausalLM, GenerationConfig
 
 from ..integrations import CausalLMExportableModule
@@ -54,12 +57,14 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
     cache_implementation = kwargs.get("cache_implementation", "static")
     max_length = kwargs.get("max_length", 2048)
     config = kwargs.get("config", None)
+    quantization_recipe = kwargs.get("quantize", None)
 
     eager_model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         device_map=device,
         torch_dtype=dtype,
         config=config,
+        # NOTE: no `quantization_config` is passed here; quantization is applied after loading via `torchao.quantize_` (see the TODO below).
         attn_implementation=attn_implementation,
         generation_config=GenerationConfig(
             use_cache=True,
@@ -71,4 +76,25 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
             },
         ),
     )
+
+    if quantization_recipe == "8da4w":
+        if parse(torchao.__version__) < parse("0.11.0.dev0"):
+            raise RuntimeError("Quantization 8da4w requires torchao >= 0.11.0. Please upgrade torchao.")
+
+        from torchao.quantization.granularity import PerGroup
+        from torchao.quantization.quant_api import (
+            Int8DynamicActivationIntxWeightConfig,
+        )
+
+        # TODO: Should switch to TorchAoConfig once the quant issue on final lm_head layer is fixed.
+        linear_config = Int8DynamicActivationIntxWeightConfig(
+            weight_dtype=torch.int4,
+            weight_granularity=PerGroup(64),
+        )
+
+        torchao.quantize_(
+            eager_model,
+            linear_config,
+        )
+
     return CausalLMExportableModule(eager_model)
diff --git a/tests/models/test_modeling_gemma3.py b/tests/models/test_modeling_gemma3.py
@@ -21,7 +21,9 @@
 import unittest
 
 import pytest
+import torchao
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
 from transformers import AutoTokenizer
 from transformers.testing_utils import slow
 
@@ -153,3 +155,40 @@ def test_gemma3_text_generation_with_custom_sdpa_float16(self):
         gc.collect()
 
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_gemma3_text_generation_with_custom_sdpa_8da4w(self):
+        model_id = "google/gemma-3-1b-it"
+        prompt = "Write a poem about a machine learning."
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        kwargs = {"quantize": "8da4w"}
+
+        # ExecuTorch model + custom sdpa + 8da4w quantization
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            **kwargs,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
diff --git a/tests/models/test_modeling_qwen3.py b/tests/models/test_modeling_qwen3.py
@@ -21,7 +21,9 @@
 import unittest
 
 import pytest
+import torchao
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
 from transformers import AutoTokenizer
 from transformers.testing_utils import slow
 
@@ -136,3 +138,39 @@ def test_qwen3_text_generation_with_custom_sdpa_float16(self):
         gc.collect()
 
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_qwen3_text_generation_with_custom_sdpa_8da4w(self):
+        model_id = "Qwen/Qwen3-0.6B"
+        prompt = "Give me a short introduction to large language model."
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        # ExecuTorch model + custom sdpa + 8da4w quantization
+        kwargs = {"quantize": "8da4w"}
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            **kwargs,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=64,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
diff --git a/tests/models/test_modeling_smollm.py b/tests/models/test_modeling_smollm.py
@@ -21,7 +21,9 @@
 import unittest
 
 import pytest
+import torchao
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
+from packaging.version import parse
 from transformers import AutoTokenizer
 from transformers.testing_utils import slow
 
@@ -106,3 +108,41 @@ def test_smollm_text_generation_with_custom_sdpa(self):
         gc.collect()
 
         self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+
+    @slow
+    @pytest.mark.run_slow
+    @pytest.mark.skipif(
+        parse(torchao.__version__) < parse("0.11.0.dev0"),
+        reason="Only available on torchao >= 0.11.0.dev0",
+    )
+    def test_smollm_text_generation_with_custom_sdpa_8da4w(self):
+        model_id = "HuggingFaceTB/SmolLM2-135M"
+        prompt = "My favourite condiment is "
+        max_seq_len = 32
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        # ExecuTorch model + custom sdpa + 8da4w quantization
+        kwargs = {"quantize": "8da4w"}
+        model = ExecuTorchModelForCausalLM.from_pretrained(
+            model_id,
+            recipe="xnnpack",
+            attn_implementation="custom_sdpa",
+            **kwargs,
+        )
+        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
+        self.assertIsInstance(model.model, ExecuTorchModule)
+
+        generated_text = model.text_generation(
+            tokenizer=tokenizer,
+            prompt=prompt,
+            max_seq_len=max_seq_len,
+        )
+        logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
+
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))