Arm backend: Add support for int32 clamp (#15977)

ArmRyan · web-flow · commit e38734e03543 · 2025-12-02T15:23:28.000+01:00
TOSA does not support the int32 dtype for the clamp operator, so int32 clamp is instead implemented with Min/Max operations. Change-Id: Iea442901b2227610ebb5e5a0f1bca8d236e70d9d cc @freddan80 @per @zingo @oscarandersson8218 @digantdesai --------- Signed-off-by: Ryan O'Shea <ryan.oshea3@arm.com>
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -52,6 +52,7 @@
 from .decompose_int16_activation_conv2d_pass import (  # noqa
     DecomposeConv2dWithInt16ActivationPass,
 )
+from .decompose_int32_clamp_pass import DecomposeInt32ClampPass  # noqa
 from .decompose_int_pow_pass import DecomposeIntPowPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -55,6 +55,7 @@
     DecomposeGluPass,
     DecomposeGroupedConvPass,
     DecomposeGroupNormPass,
+    DecomposeInt32ClampPass,
     DecomposeIntPowPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
@@ -122,7 +123,6 @@
 
 
 class ArmPassManager(PassManager):
-
     def __init__(self, tosa_spec: TosaSpecification) -> None:
         self.tosa_spec = tosa_spec
         super().__init__()
@@ -174,6 +174,7 @@ def _tosa_pipeline(
                 FuseQuantizedActivationPass(),
                 RemoveGetItemPass(),
                 ConvertToClampPass(),
+                DecomposeInt32ClampPass(),
                 DecomposeGroupNormPass(),
                 DecomposeLayerNormPass(),
                 DecomposeBatchNormNoStatsPass(),
diff --git a/backends/arm/_passes/decompose_int32_clamp_pass.py b/backends/arm/_passes/decompose_int32_clamp_pass.py
@@ -0,0 +1,72 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeInt32ClampPass(ArmPass):
+    """Rewrite int32 clamp into min/max chain since TOSA lacks int32 clamp support."""
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+    _supported_ops = {
+        exir_ops.edge.aten.clamp.default,
+        torch.ops.aten.clamp.default,
+    }
+
+    def _ensure_tensor(
+        self,
+        value,
+        ref_tensor,
+        dtype,
+        rank,
+        meta,
+    ):
+        if value is None:
+            return None
+        return super().call_operator(
+            exir_ops.edge.aten.full.default,
+            ((1,) * rank, value),
+            {"dtype": dtype},
+            meta,
+            updated=True,
+        )
+
+    def call_operator(self, op, args, kwargs, meta):
+        val = meta["val"]
+        if op not in self._supported_ops or val.dtype != torch.int32:
+            return super().call_operator(op, args, kwargs, meta)
+
+        input_tensor = args[0]
+        min_arg = args[1] if len(args) > 1 else None
+        max_arg = args[2] if len(args) > 2 else None
+        dtype = val.dtype
+        rank = len(val.shape)
+
+        min_arg = self._ensure_tensor(min_arg, input_tensor, dtype, rank, meta)
+        max_arg = self._ensure_tensor(max_arg, input_tensor, dtype, rank, meta)
+
+        current = input_tensor
+        if max_arg is not None:
+            current = super().call_operator(
+                exir_ops.edge.aten.minimum.default,
+                (current, max_arg),
+                {},
+                meta,
+                updated=True,
+            )
+        if min_arg is not None:
+            current = super().call_operator(
+                exir_ops.edge.aten.maximum.default,
+                (current, min_arg),
+                {},
+                meta,
+                updated=True,
+            )
+        return current
diff --git a/backends/arm/operators/op_clamp.py b/backends/arm/operators/op_clamp.py
@@ -40,7 +40,6 @@ def __init__(self, *args):
     def _get_min_max_arguments(
         self, node: Node, dtype: torch.dtype
     ) -> Tuple[int | float, int | float]:
-
         def cast_type(value: Any) -> int | float:
             if isinstance(value, int):
                 return value
@@ -91,7 +90,12 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [inputs[0], output],
-            [ts.DType.INT8, ts.DType.INT16, ts.DType.FP16, ts.DType.FP32],
+            [
+                ts.DType.INT8,
+                ts.DType.INT16,
+                ts.DType.FP16,
+                ts.DType.FP32,
+            ],
             output.tosa_spec,
         )
 
diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py
@@ -35,6 +35,25 @@
     "rank_4_no_max": lambda: (torch.rand(1, 10, 10, 1) - 3, -3.3, None),
 }
 
+test_data_suite_int32 = {
+    "int32_rank2": lambda: (torch.randint(-50, 50, (2, 3), dtype=torch.int32), -10, 10),
+    "int32_rank3_no_min": lambda: (
+        torch.randint(-100, 100, (1, 3, 3), dtype=torch.int32),
+        None,
+        25,
+    ),
+    "int32_rank3_no_max": lambda: (
+        torch.randint(-100, 100, (1, 3, 3), dtype=torch.int32),
+        -25,
+        None,
+    ),
+    "int32_rank4_large_range": lambda: (
+        torch.randint(-200, 200, (1, 2, 4, 4), dtype=torch.int32),
+        torch.iinfo(torch.int32).min,
+        torch.iinfo(torch.int32).max,
+    ),
+}
+
 
 class Clamp(torch.nn.Module):
     def __init__(
@@ -53,7 +72,6 @@ def forward(self, x):
 
 @common.parametrize("test_data", test_data_suite)
 def test_clamp_tosa_FP(test_data):
-
     input_tensor, min_val, max_val = test_data()
     model = Clamp(min_val, max_val)
 
@@ -69,7 +87,6 @@ def test_clamp_tosa_FP(test_data):
 
 @common.parametrize("test_data", test_data_suite)
 def test_clamp_tosa_INT(test_data):
-
     input_tensor, min_val, max_val = test_data()
     model = Clamp(min_val, max_val)
 
@@ -84,6 +101,22 @@ def test_clamp_tosa_INT(test_data):
     pipeline.run()
 
 
+@common.parametrize("test_data", test_data_suite_int32)
+def test_clamp_tosa_INT_int32_inputs(test_data):
+    input_tensor, min_val, max_val = test_data()
+    model = Clamp(min_val, max_val)
+
+    pipeline = TosaPipelineINT[input_t](
+        model,
+        (input_tensor,),
+        aten_op,
+        exir_op,
+    )
+    pipeline.change_args("run_method_and_compare_outputs", qtol=1)
+    pipeline.pop_stage("quantize")
+    pipeline.run()
+
+
 @common.parametrize("test_data", test_data_suite)
 def test_clamp_tosa_INT_a16w8(test_data):
     """Test clamp operation with int16 I/O quantization for TOSA INT."""
@@ -103,7 +136,6 @@ def test_clamp_tosa_INT_a16w8(test_data):
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone300
 def test_clamp_u55_INT(test_data):
-
     input_tensor, min_val, max_val = test_data()
     model = Clamp(min_val, max_val)
 
@@ -140,7 +172,6 @@ def test_clamp_16a8w_u55_INT16(test_data):
 @common.parametrize("test_data", test_data_suite)
 @common.XfailIfNoCorstone320
 def test_clamp_u85_INT(test_data):
-
     input_tensor, min_val, max_val = test_data()
     model = Clamp(min_val, max_val)
 
diff --git a/backends/arm/test/passes/test_convert_int32_clamp_to_minmax_pass.py b/backends/arm/test/passes/test_convert_int32_clamp_to_minmax_pass.py
@@ -0,0 +1,44 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm._passes.decompose_int32_clamp_pass import (
+    DecomposeInt32ClampPass,
+)
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
+
+input_t = Tuple[torch.Tensor]
+
+
+class ClampInt32(torch.nn.Module):
+    test_data = {"rand": (torch.randint(-50, 50, (2, 3), dtype=torch.int32),)}
+
+    def forward(self, x: torch.Tensor):
+        return torch.clamp(x, -10, 5)
+
+
+@common.parametrize("test_data", ClampInt32.test_data)
+def test_decompose_int32_clamp_pass(test_data: input_t):
+    module = ClampInt32()
+    pipeline = PassPipeline[input_t](
+        module,
+        test_data,
+        quantize=False,
+        ops_before_pass={
+            "executorch_exir_dialects_edge__ops_aten_clamp_default": 1,
+        },
+        ops_after_pass={
+            "executorch_exir_dialects_edge__ops_aten_minimum_default": 1,
+            "executorch_exir_dialects_edge__ops_aten_maximum_default": 1,
+        },
+        ops_not_after_pass=[
+            "executorch_exir_dialects_edge__ops_aten_clamp_default",
+        ],
+        pass_list=[DecomposeInt32ClampPass],
+    )
+    pipeline.run()

Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,7 @@`
`52`	`52`	`from .decompose_int16_activation_conv2d_pass import ( # noqa`
`53`	`53`	`DecomposeConv2dWithInt16ActivationPass,`
`54`	`54`	`)`
	`55`	`+from .decompose_int32_clamp_pass import DecomposeInt32ClampPass # noqa`
`55`	`56`	`from .decompose_int_pow_pass import DecomposeIntPowPass # noqa`
`56`	`57`	`from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa`
`57`	`58`	`from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa`