Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[microTVM] Use QNN schedules to give SOTA performance #13752

Merged
merged 19 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions include/tvm/relay/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,13 @@ TVM_DLL Pass RemoveUnusedFunctions(Array<runtime::String> entry_functions);
*/
TVM_DLL Pass SimplifyExpr();

/*!
 * \brief Stripped down version of SimplifyExpr which is run after AlterOpLayout.
 *
 * NOTE(review): presumably restricted to the SimplifyExpr rewrites that remain
 * valid once layouts have been rewritten — confirm against the pass definition
 * in src/relay/transforms/simplify_expr.cc.
 *
 * \return The pass.
 */
TVM_DLL Pass SimplifyExprPostAlterOp();

/*!
* \brief Run any custom passes registered under "RelayToTIR" attributes on TargetKinds.
*
Expand Down
17 changes: 0 additions & 17 deletions python/tvm/relay/op/nn/_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,23 +877,6 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)


# QNN ops
@reg.register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
    """Alter the layout of an add op.

    Useful for fusing the bias constant with an input zero point constant from a
    preceding quantized op. Only takes effect when the previous op is quantized,
    which is why the implementation lives in topi.nn.qnn.
    """
    alter = topi.nn.qnn.qnn_add_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@reg.register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
    """Alter the layout of a requantization op."""
    alter = topi.nn.qnn.qnn_requantize_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


# bitpack
@reg.register_compute("nn.bitpack")
def compute_bitpack(attrs, inputs, out_dtype):
Expand Down
60 changes: 58 additions & 2 deletions python/tvm/relay/qnn/op/_qnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,20 @@
# pylint: disable=invalid-name, unused-argument, len-as-condition
"""QNN operator feature registration"""

import numpy as np

from tvm import topi

from .. import strategy
from ...op.op import register_compute
from ...op.op import register_injective_schedule
from ...op.op import register_strategy, register_pattern, register_alter_op_layout, OpPattern
from ...op.op import (
OpPattern,
register_alter_op_layout,
register_legalize,
register_pattern,
register_strategy,
)


@register_compute("qnn.simulated_quantize")
Expand Down Expand Up @@ -85,12 +93,60 @@ def simulated_dequantize_compute(attrs, inputs, output_type):
register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy)


@register_legalize("clip")
def legalize_clip(attrs, inputs, tinfos):
    """Remove clip ops whose bounds match the full range of their dtype.

    TVM's simplification passes already do this after alter_op, but certain QNN
    operator implementations (like Cortex-M) need the clip removed earlier,
    during legalization. Only a clip fed directly by a qnn.requantize op is
    considered; all other clips are left untouched (return None).
    """
    producer = inputs[0]
    if hasattr(producer, "op") and producer.op.name == "qnn.requantize":
        bounds = np.iinfo(tinfos[0].dtype)
        # Operand order matters: attrs values may be TVM scalar objects whose
        # reflected __eq__ is relied upon here, matching the original check.
        if bounds.min == attrs.a_min and bounds.max == attrs.a_max:
            return producer
    return None


@register_legalize("nn.bias_add")
def legalize_bias_add(attrs, inputs, tinfos):
    """Legalize a bias add operator.

    May be used to "fold in" unused channels from quantized convolution
    operators. Doing this before layout rewrites minimizes the number of extra
    "cast" and "layout_transform" overhead operators that get introduced.
    """
    legalize = topi.nn.bias_add_legalize
    return legalize(attrs, inputs, tinfos)


@register_alter_op_layout("qnn.conv2d")
def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type):
    """Alter the layout of a qnn conv2d op.

    May be used to alter the current QNN Conv2D op, but can also be used to
    alter previous ops to better match the current op. For example, Arm
    Cortex-M uses this to set the out_layout of previous ops to the input
    layout preferred by future layouts.
    """
    alter = topi.nn.qnn_conv2d_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
    """Alter the layout of an add op.

    Useful for fusing the bias constant with an input zero point constant from
    a preceding quantized op. Only takes effect when the previous op is
    quantized.
    """
    alter = topi.nn.add_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
    """Alter the layout of a requantization op."""
    alter = topi.nn.qnn_requantize_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


# qnn.dense
register_strategy("qnn.dense", strategy.qnn_dense_strategy)

Expand Down
73 changes: 67 additions & 6 deletions python/tvm/relay/qnn/strategy/arm_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,55 @@
regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""

from tvm import topi, TVMError
from .generic import qnn_conv2d_strategy
from tvm.topi.utils import get_const_tuple
from ... import op as _op
from ...op.strategy.generic import is_depthwise_conv2d
from .generic import (
qnn_conv2d_strategy,
qnn_dense_strategy,
qnn_dequantize_strategy,
qnn_quantize_strategy,
wrap_compute_dequantize,
wrap_compute_quantize,
wrap_topi_qnn_dense,
wrap_topi_schedule,
)


@qnn_quantize_strategy.register("arm_cpu")
def qnn_quantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.quantize strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion these Hexagon schedules are naive but acceptable for now, as
    quantize is not a hot spot in convolutional models.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_compute_quantize(topi.hexagon.qnn_quantize),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize),
        name="qnn_quantize.arm_cpu",
    )
    return out


@qnn_dequantize_strategy.register("arm_cpu")
def qnn_dequantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.dequantize strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion these Hexagon schedules are naive but acceptable for now.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_compute_dequantize(topi.hexagon.qnn_dequantize),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize),
        name="qnn_dequantize.arm_cpu",
    )
    return out


@qnn_dense_strategy.register("arm_cpu")
def qnn_dense_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.dense strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion the Hexagon dense schedule is naive, but dense ops do not take
    much time on convolutional models, so this is acceptable until a dedicated
    Cortex-M schedule lands.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_topi_qnn_dense(topi.hexagon.qnn_dense),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_dense),
        name="qnn_dense.arm_cpu",
    )
    return out


@qnn_conv2d_strategy.register("arm_cpu")
Expand Down Expand Up @@ -59,13 +105,28 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
topi.arm_cpu.schedule_qnn_conv2d,
name="qnn_conv2d.arm_cpu",
)
else:
raise TVMError("QNN regular Conv2D for Arm Cortex-M DSP got incorrect input layout!")
elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
if data_layout == "NCHW" and kernel_layout == "IOHW":
strategy.add_implementation(
topi.arm_cpu.qnn_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_depthwise_conv2d,
name="qnn_depthwise_conv2d.arm_cpu",
)
height, width = data.shape[2:]
y_stride, x_stride = get_const_tuple(attrs.strides)
if height * width * y_stride % 2 == 0:
strategy.add_implementation(
topi.arm_cpu.qnn_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_depthwise_conv2d,
name="qnn_depthwise_conv2d.arm_cpu",
)
elif y_stride == x_stride == 1:
strategy.add_implementation(
topi.arm_cpu.qnn_unrolled_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_unrolled_depthwise_conv2d,
name="qnn_unrolled_depthwise_conv2d.arm_cpu",
)
else:
raise TVMError("No QNN depthwise Conv2D Cortex-M schedule supports these params!")
else:
raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!")
else:
raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")

Expand Down
3 changes: 2 additions & 1 deletion python/tvm/topi/arm_cpu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@
from .conv2d_transpose import *
from .conv2d_int8 import *
from . import conv2d_alter_op
from . import qnn_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
from .group_conv2d import *
from .pooling import *
from .dense import *
from .qnn import *
from . import qnn_alter_op
from . import qnn_legalize
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,14 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
num_packed = (num_outputs - offset) // 2
for i in range(num_packed):
index = 2 * i + offset
yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);"
# We must explicitly call asm inline to use the PKHBT instruction. It is not part of
# ACLE and has no __builtin. Writing it using masks and bitshifts does not work either:
# Arm GCC 12 with -O3 does not compile these efficiently.
yield f"int packed_res_{i};"
yield (
f'__asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : '
f'"r" (requant_{index}), "r" (requant_{index + 1}));'
)

if offset == 1:
yield "((int16_t*) output)[1] = (int16_t) requant_0;"
Expand Down
Loading