
Commit ca1cd7f

Ninja91 authored and facebook-github-bot committed
Add 16A8W linear ops support and test
Summary:
- Adds a linear ops test using the 16A8W config in the INT16 profile.
- Adds INT16 dtype support to view op validation.
- Validated with the TOSA pipeline test. Note: not yet verified against the TOSA reference model.

Differential Revision: D80308822
1 parent b52a083 commit ca1cd7f
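
The new config plugs into the existing Arm quantizer flow. A minimal sketch of direct usage outside the test pipeline, assuming TOSAQuantizer exposes set_global (as other quantizers in this backend do) and that tosa_spec is a TosaSpecification built with the int16 extension:

    from executorch.backends.arm.quantizer.arm_quantizer import (
        TOSAQuantizer,
        get_16a8w_quantization_config,
    )

    # tosa_spec: assumed to be a TosaSpecification with the int16 extension enabled
    quantizer = TOSAQuantizer(tosa_spec)
    quantizer.set_global(get_16a8w_quantization_config(is_per_channel=False))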

File tree

5 files changed (+43, -7 lines)

backends/arm/operators/op_view.py
backends/arm/quantizer/arm_quantizer.py
backends/arm/test/ops/test_linear.py
backends/arm/test/pytest.ini
backends/arm/test/tester/test_pipeline.py

backends/arm/operators/op_view.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [inputs[0], output],
-            [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32, ts.DType.BOOL],
+            [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32, ts.DType.FP32, ts.DType.BOOL],
             output.tosa_spec,
         )
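
For context, a rough standalone sketch of what this check enforces (check_dtypes is a hypothetical stand-in; the real validate_valid_dtype lives in the Arm operator validation utilities):

    def check_dtypes(target, tensors, allowed_dtypes, tosa_spec):
        # Reject any tensor whose dtype is outside the allowed set for this op.
        for tensor in tensors:
            if tensor.dtype not in allowed_dtypes:
                raise ValueError(
                    f"{target}: dtype {tensor.dtype} is not supported by "
                    f"{tosa_spec}; expected one of {allowed_dtypes}"
                )

With ts.DType.INT16 added to the allowed list, view nodes carrying 16-bit activations now pass this validation.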

backends/arm/quantizer/arm_quantizer.py

Lines changed: 1 addition & 1 deletion

@@ -188,7 +188,7 @@ def get_16a8w_quantization_config(

     # 16-bit activation quantization spec
     act_quantization_spec = QuantizationSpec(
-        dtype=torch.int32,
+        dtype=torch.int16,
         quant_min=torch.iinfo(torch.int16).min,  # -32768
         quant_max=torch.iinfo(torch.int16).max,  # 32767
         qscheme=torch.per_tensor_affine,
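
The dtype previously read torch.int32 even though quant_min and quant_max were already the int16 bounds; this change makes the dtype agree with the range. Pieced together, the activation spec now amounts to roughly the following (the observer is an assumption for illustration; the actual constructor is whatever get_16a8w_quantization_config wires up):

    import torch
    from torch.ao.quantization.observer import HistogramObserver
    from torch.ao.quantization.quantizer import QuantizationSpec

    act_quantization_spec = QuantizationSpec(
        dtype=torch.int16,
        quant_min=torch.iinfo(torch.int16).min,  # -32768
        quant_max=torch.iinfo(torch.int16).max,  # 32767
        qscheme=torch.per_tensor_affine,
        observer_or_fake_quant_ctr=HistogramObserver,  # assumed observer choice
    )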

backends/arm/test/ops/test_linear.py

Lines changed: 34 additions & 0 deletions

@@ -11,6 +11,9 @@
 import pytest
 
 import torch
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    get_16a8w_quantization_config,
+)
 from executorch.backends.arm.test import common
 
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -258,3 +261,34 @@ def test_linear_vgf_INT(test_data: torch.Tensor):
         per_channel_quantization=per_channel_quantization,
     )
     pipeline.run()
+
+
+@pytest.mark.xfail(
+    reason="TOSA backend has limited INT16 support - view operations only support INT8/INT32/FP32/BOOL"
+)
+@common.parametrize("test_data", test_data_rank1_INT)
+def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
+    """Test linear operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
+    test_data, out_features, has_bias, per_channel_quantization = test_data()
+    in_features = test_data.shape[-1]
+
+    # Create pipeline with custom 16A8W quantization config
+    pipeline = TosaPipelineINT[input_t1](
+        Linear(
+            in_features=in_features,
+            out_features=out_features,
+            bias=has_bias,
+        ),
+        (test_data,),
+        aten_op,
+        exir_op=[],
+        per_channel_quantization=per_channel_quantization,
+        use_to_edge_transform_and_lower=True,
+        quantization_config=get_16a8w_quantization_config(
+            is_per_channel=per_channel_quantization
+        ),
+        tosa_extensions=["int16"],
+    )
+
+    # Run the pipeline
+    pipeline.run()
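
To exercise just the new test locally, a standard pytest invocation along these lines should work from the repository root:

    pytest backends/arm/test/ops/test_linear.py -k test_linear_16a8w_tosa_INT -v

The test is expected to fail for now (xfail) while INT16 coverage in the TOSA backend remains partial.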

backends/arm/test/pytest.ini

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@ addopts = --strict-markers
 markers =
     slow: Tests that take long time
     tosa_ref_model: Tests that use TOSA reference model # Temporary!
+    flaky: Tests that are known to be flaky
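
Because addopts includes --strict-markers, pytest rejects markers that are not registered here, so this entry must exist before any test can be tagged as flaky. A hypothetical example:

    import pytest

    @pytest.mark.flaky
    def test_known_to_be_flaky():
        ...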

backends/arm/test/tester/test_pipeline.py

Lines changed: 6 additions & 5 deletions

@@ -108,7 +108,6 @@ def __init__(
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
         ] = None,
     ):
-
         self.tester = ArmTester(
             module,
             example_inputs=test_data,
@@ -341,6 +340,7 @@ def __init__(
         qtol: int = 1,
         dynamic_shapes: Optional[Tuple[Any]] = None,
         tosa_extensions: Optional[List[str]] = None,
+        quantization_config: Optional[Any] = None,
     ):
         if tosa_extensions is None:
             tosa_extensions = []
@@ -356,9 +356,11 @@ def __init__(
         )
 
         quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
-        quantization_config = get_symmetric_quantization_config(
-            is_per_channel=per_channel_quantization
-        )
+        # Use custom quantization config if provided, otherwise use default
+        if quantization_config is None:
+            quantization_config = get_symmetric_quantization_config(
+                is_per_channel=per_channel_quantization
+            )
         if symmetric_io_quantization:
             quantizer.set_io(quantization_config)
         quant_stage = Quantize(quantizer, quantization_config)
@@ -916,7 +918,6 @@ def __init__(
         ] = None,
         tosa_extensions: Optional[List[str]] = None,
     ):
-
         if tosa_extensions is None:
             tosa_extensions = []
         tosa_spec = TosaSpecification.create_from_string(
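
Resolving the default inside __init__ rather than in the signature keeps existing TosaPipelineINT call sites unchanged: omitting quantization_config still selects the symmetric config, while callers such as test_linear_16a8w_tosa_INT can now inject get_16a8w_quantization_config. When symmetric_io_quantization is set, whichever config is active is also applied to the model IO via quantizer.set_io.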
