Skip to content

Commit 31ce12b

Browse files
Merge branch 'main' into mean-default
2 parents cf2dbe8 + f7ca57e commit 31ce12b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

83 files changed

+602
-447
lines changed

.mypy.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ ignore_missing_imports = True
8383
[mypy-tosa_tools.*]
8484
ignore_missing_imports = True
8585

86+
[mypy-tosa_serializer]
87+
ignore_missing_imports = True
88+
89+
[mypy-tosa_serializer.*]
90+
ignore_missing_imports = True
91+
8692
[mypy-setuptools.*]
8793
ignore_missing_imports = True
8894

backends/arm/_passes/decompose_int16_activation_conv2d_pass.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,14 @@ def call_operator(self, op, args, kwargs, meta):
105105

106106
conv_output = super().call_operator(
107107
exir_ops.backend.tosa.RESCALE.default,
108-
(convolution, torch.int32, conv_rescale_factor, 0, 0),
108+
(convolution, torch.int32, [conv_rescale_factor], 0, 0),
109109
{},
110110
new_meta,
111111
)
112112

113113
bias_rescaled = super().call_operator(
114114
exir_ops.backend.tosa.RESCALE.default,
115-
(channel_bias, torch.int32, bias_rescale_factor, 0, 0),
115+
(channel_bias, torch.int32, [bias_rescale_factor], 0, 0),
116116
{},
117117
new_meta,
118118
)
@@ -129,7 +129,7 @@ def call_operator(self, op, args, kwargs, meta):
129129
(
130130
add,
131131
output_dtype,
132-
(common_scale / (conv_output_scale * (1 << bits_left_to_shift))),
132+
[(common_scale / (conv_output_scale * (1 << bits_left_to_shift)))],
133133
0,
134134
0,
135135
),

backends/arm/_passes/insert_rescales_pass.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule
4545
(
4646
node.all_input_nodes[0],
4747
q_args.dtype,
48-
new_scale,
48+
[new_scale],
4949
dq_args.zp,
5050
q_args.zp,
5151
),
@@ -228,10 +228,10 @@ def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> b
228228
(
229229
arg_node,
230230
torch.int32,
231-
qp.get_scale_per_tensor()
232-
/ rescale_qargs[
233-
i
234-
].get_scale_per_tensor(), # Old scale / new scale
231+
[
232+
qp.get_scale_per_tensor()
233+
/ rescale_qargs[i].get_scale_per_tensor()
234+
], # [Old scale / new scale]
235235
qp.get_zp_per_tensor(), # Old zero point
236236
rescale_qargs[i].get_zp_per_tensor(), # New zero point
237237
),
@@ -264,8 +264,10 @@ def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> b
264264
(
265265
node,
266266
qarg.dtype,
267-
rescale_qargs.get_scale_per_tensor()
268-
/ qarg.get_scale_per_tensor(), # Old scale / new scale
267+
[
268+
rescale_qargs.get_scale_per_tensor()
269+
/ qarg.get_scale_per_tensor()
270+
], # [Old scale / new scale]
269271
rescale_qargs.get_zp_per_tensor(), # Old zero point
270272
qarg.get_zp_per_tensor(), # New zero point
271273
),

backends/arm/_passes/insert_table_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
286286
rescale_node = create_node(
287287
graph=graph_module.graph,
288288
op_target=exir_ops.backend.tosa.RESCALE.default,
289-
args=(table_op_node, output_qparams[0].dtype, scale, 0, 0),
289+
args=(table_op_node, output_qparams[0].dtype, [scale], 0, 0),
290290
)
291291
output_node = rescale_node
292292

backends/arm/_passes/rewrite_conv2d_pass.py

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# LICENSE file in the root directory of this source tree.
55

66

7+
import itertools
78
from typing import Set, Type
89

910
import torch
@@ -16,6 +17,10 @@
1617
is_buffer,
1718
is_param,
1819
)
20+
from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
21+
get_input_qparams,
22+
get_output_qparams,
23+
)
1924
from executorch.backends.arm.constants import HWCM_ORDER, NHWC_INVERSE_ORDER
2025
from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
2126
from executorch.backends.transforms.utils import create_constant_placeholder
@@ -156,6 +161,40 @@ def _add_bias(
156161
node.update_arg(2, bias_node)
157162
return bias_node
158163

164+
def insert_output_rescale(self, graph_module, node):
165+
input_qparams = get_input_qparams(node)
166+
output_qparams = get_output_qparams(node)[0]
167+
weight_qparams = input_qparams[1]
168+
input_qparams = input_qparams[0]
169+
is_per_channel = weight_qparams.per_channel
170+
if is_per_channel:
171+
weight_scale = weight_qparams.get_scale_per_channel()
172+
else:
173+
weight_scale = [weight_qparams.get_scale_per_tensor()]
174+
input_scale = input_qparams.get_scale_per_tensor()
175+
post_conv2d_scale = [
176+
(inp * w) / out
177+
for inp, w, out in zip(
178+
itertools.cycle([input_scale]),
179+
weight_scale,
180+
itertools.cycle([output_qparams.get_scale_per_tensor()]),
181+
)
182+
]
183+
with graph_module.graph.inserting_after(node):
184+
rescale_node = create_node(
185+
graph=graph_module.graph,
186+
op_target=exir_ops.backend.tosa.RESCALE.default,
187+
args=(
188+
node,
189+
output_qparams.dtype,
190+
post_conv2d_scale,
191+
0,
192+
output_qparams.get_zp_per_tensor(),
193+
),
194+
from_node=node,
195+
)
196+
return rescale_node
197+
159198
def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
160199
modified = False
161200
for node in graph_module.graph.nodes:
@@ -180,20 +219,20 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
180219
) = node.args
181220

182221
pad = [val for val in pad for _ in (0, 1)]
183-
input_shape = get_first_fake_tensor(x).shape
184-
weight_shape = get_first_fake_tensor(weight).shape
222+
input_fake_tensor = get_first_fake_tensor(x)
223+
weight_fake_tensor = get_first_fake_tensor(weight)
185224
# Adjust the pad value if needed to meet the
186225
# strict convolution output shape calculation.
187226
pad[1] = self._adjust_pad_if_needed(
188-
input_shape[2],
189-
weight_shape[2],
227+
input_fake_tensor.shape[2],
228+
weight_fake_tensor.shape[2],
190229
stride[0],
191230
pad[1],
192231
dilation[0],
193232
)
194233
pad[3] = self._adjust_pad_if_needed(
195-
input_shape[3],
196-
weight_shape[3],
234+
input_fake_tensor.shape[3],
235+
weight_fake_tensor.shape[3],
197236
stride[1],
198237
pad[3],
199238
dilation[1],
@@ -204,7 +243,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
204243

205244
if self._is_depthwise_conv2d(node):
206245
target_op = exir_ops.backend.tosa.DEPTHWISE_CONV2D.default
207-
self._reshape_weights(weight, input_shape[1])
246+
self._reshape_weights(weight, input_fake_tensor.shape[1])
247+
weight_fake_tensor = get_first_fake_tensor(weight)
208248
else:
209249
target_op = exir_ops.backend.tosa.CONV2D.default
210250

@@ -227,9 +267,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
227267
args=conv2d_args,
228268
from_node=node,
229269
)
270+
bias_fake_tensor = get_first_fake_tensor(bias) if bias else None
271+
tosa_node_fake_tensor = target_op(
272+
input_fake_tensor,
273+
weight_fake_tensor,
274+
bias_fake_tensor,
275+
*conv2d_args[3:],
276+
)
230277

278+
if (
279+
tosa_node_fake_tensor.dtype == torch.int32
280+
and input_fake_tensor.dtype == torch.int8
281+
) or (
282+
tosa_node_fake_tensor.dtype == torch.int32
283+
and input_fake_tensor.dtype == torch.int16
284+
):
285+
output_rescale = self.insert_output_rescale(graph_module, tosa_op)
286+
node.replace_all_uses_with(output_rescale)
287+
if input_fake_tensor.dtype == torch.int16:
288+
tosa_op.meta[TosaSpecialDtype.meta_key()] = TosaSpecialDtype.INT48
289+
else:
231290
node.replace_all_uses_with(tosa_op)
232-
graph_module.graph.erase_node(node)
291+
292+
graph_module.graph.erase_node(node)
233293

234294
if modified:
235295
graph_module.recompile()

backends/arm/_passes/rewrite_matmul.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def _insert_output_rescale(self, graph_module, node, tosa_matmul_node, dtype):
4444
rescale_node.args = (
4545
tosa_matmul_node,
4646
dtype,
47-
scale,
47+
[scale],
4848
0,
4949
output_qparams.get_zp_per_tensor(),
5050
)

backends/arm/_passes/rewrite_upsample.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def call(self, graph_module):
7474
rescale_node.args = (
7575
tosa_resize_node,
7676
output_dtype,
77-
output_scale,
77+
[output_scale],
7878
0, # zero point
7979
0, # zero point
8080
)

backends/arm/common/debug.py

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
import os
88
from typing import Optional
99

10-
import serializer.tosa_serializer as ts
1110
import torch
11+
12+
import tosa_serializer as ts
1213
from executorch.exir.print_program import inspect_node
1314

1415
logger = logging.getLogger(__name__)
@@ -50,29 +51,20 @@ def get_node_debug_info(
5051
return output
5152

5253

53-
# Output TOSA flatbuffer and test harness file
54-
def debug_tosa_dump(tosa_graph: ts.TosaSerializer, path: str, suffix: str = ""):
54+
# Output TOSA flatbuffer for debugging
55+
def debug_tosa_dump(tosa_graph: bytes, path: str, suffix: str = ""):
5556
filename = f"output{suffix}.tosa"
5657

5758
logger.info(f"Emitting debug output to: {path=}, {suffix=}")
5859

5960
os.makedirs(path, exist_ok=True)
6061

61-
fb = tosa_graph.serialize()
62-
js = tosa_graph.writeJson(filename)
63-
6462
filepath_tosa_fb = os.path.join(path, filename)
6563
with open(filepath_tosa_fb, "wb") as f:
66-
f.write(fb)
64+
f.write(tosa_graph)
6765
if not os.path.exists(filepath_tosa_fb):
6866
raise IOError("Failed to write TOSA flatbuffer")
6967

70-
filepath_desc_json = os.path.join(path, f"desc{suffix}.json")
71-
with open(filepath_desc_json, "w") as f:
72-
f.write(js)
73-
if not os.path.exists(filepath_desc_json):
74-
raise IOError("Failed to write TOSA JSON")
75-
7668

7769
def debug_fail(
7870
node,
@@ -81,7 +73,7 @@ def debug_fail(
8173
path: Optional[str] = None,
8274
):
8375
logger.warning("Internal error due to poorly handled node:")
84-
if tosa_graph is not None and path is not None:
85-
debug_tosa_dump(tosa_graph, path)
76+
if tosa_graph is not None and path:
77+
debug_tosa_dump(tosa_graph.serialize(), path)
8678
logger.warning(f"Debug output captured in '{path}'.")
8779
debug_node(node, graph_module)

backends/arm/debug/schema.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from dataclasses import asdict, dataclass
1111
from typing import Any, Optional
1212

13-
import serializer.tosa_serializer as ts
1413
import torch
14+
import tosa_serializer as ts
1515

1616
from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
1717

@@ -114,23 +114,18 @@ def to_dict(self) -> dict[str, Any]:
114114
class DebugHook:
115115
def __init__(self, debug_mode: ArmCompileSpec.DebugMode) -> None:
116116
self._debug_events: list[DebugSchema] = []
117-
self.__op_id_to_name = {}
118117
self.mode = debug_mode
119118

120-
# Build up a mapping from TOSA 1.0 operator IDs to their names
121-
for name, val in vars(ts.Op).items():
122-
self.__op_id_to_name[val] = name
123-
124-
def add(self, node: torch.fx.Node, tosa_op: Any, tosa_op_id: int) -> DebugSchema:
119+
def add(self, node: torch.fx.Node, tosa_op: Any, tosa_op_id: ts.Op) -> DebugSchema:
125120
tosa_debug_info = None
126121

127122
# If the debug data is being embedded into the TOSA flatbuffer
128123
# do not collect TOSADebugSchema data, it's redundent
129124
if self.mode != ArmCompileSpec.DebugMode.TOSA:
130125
tosa_debug_info = TosaDebugSchema(
131126
node_name=str(tosa_op),
132-
operator_name=self.__op_id_to_name[tosa_op_id],
133-
operator_id=tosa_op_id,
127+
operator_name=str(tosa_op_id),
128+
operator_id=int(tosa_op_id),
134129
)
135130

136131
aten_debug_info = ATenDebugSchema.from_node(node)

backends/arm/ethosu/backend.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,15 @@ def _compile_tosa_flatbuffer(
5151
"compile_flags are required in the CompileSpec list for EthosUBackend"
5252
)
5353

54+
# Vela tooling only supports flatbuffers up to 2 GiB.
55+
max_flatbuffer_size = 2 * 1024 * 1024 * 1024
56+
flatbuffer_size = len(tosa_flatbuffer)
57+
if flatbuffer_size > max_flatbuffer_size:
58+
raise RuntimeError(
59+
"TOSA flatbuffer is too large for Vela "
60+
f"({flatbuffer_size} bytes > {max_flatbuffer_size} bytes limit)."
61+
)
62+
5463
# Pass on the TOSA flatbuffer to the vela compiler.
5564
binary = vela_compile(
5665
tosa_flatbuffer,

0 commit comments

Comments
 (0)