From d5d4caa76d975adc81e5960872ad88507107628e Mon Sep 17 00:00:00 2001
From: Gavin Uberti
Date: Fri, 2 Sep 2022 11:02:10 -0700
Subject: [PATCH] Address comments from Ashutosh

---
 python/tvm/relay/op/strategy/arm_cpu.py                 | 10 +++++-----
 .../tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py   |  5 +++--
 .../mprofile/dsp/micro_kernel/quad_channel_convolve.py  |  4 ++--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 6d19982c995f..2d9ef99ba8a6 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -237,11 +237,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 )
 
             # Optimized special case depthwiseConv2D operation. Requires a 3x3 kernel, a
-            # NHWC layout, a HWOI kernel layout (which we would ideally rearrange), no dilation,
-            # "SAME" padding, int8 inputs and outputs, the same number of input and output
-            # channels, and for that channel count to be divisible by 4.
-            #
-            # Additional work could remove some of these restrictions.
+            # NHWC layout, a HWOI kernel layout (which we rearrange), no dilation, int8 inputs,
+            # int32 output, the same number of input and output channels, and for that channel
+            # count to be divisible by 4. Additional work could remove these restrictions.
             elif (
                 target.features.has_dsp
@@ -249,7 +247,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 and dilation_w == dilation_h == 1
                 and kernel.shape[3] == 1  # channel_multiplier == 1
                 and data.dtype == "int8"
+                and out_type.dtype == "int32"
                 and data.shape[3] % 4 == 0
+                and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
             ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
index ede822da76b3..162bf65a21f9 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
@@ -149,8 +149,9 @@ def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dila
     output_h = height // stride_h
     output_w = width // stride_w
 
-    # Note - this padding behavior is DIFFERENT from Tensorflow, which pads the top left if
-    # stride > 1. Need to investigate and decide which behavior we want.
+    # This padding behavior is consistent with other TVM depthwise_conv2d schedules. However,
+    # it differs from TensorFlow, which only pads the bottom right if stride > 1. This may
+    # slightly reduce accuracy for models imported from TFLite.
     pad_down = 1 if stride_h == 1 else 0
     pad_right = 1 if stride_w == 1 else 0
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
index 4d8536866d47..960ef8fadc0e 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
@@ -70,7 +70,7 @@ def intrin_func(ins, outs):
         builder.emit(
             tir.call_extern(
                 "int32",
-                f"kernel_convolve_{tensor_w}_{channels}_{kernel_h}_{kernel_w}_{suffix}",
+                f"kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}",
                 outs[0].access_ptr("w"),
                 ins[0].access_ptr("r"),
                 ins[1].access_ptr("r"),
@@ -131,7 +131,7 @@ def quad_channel_convolve_impl(tensor_w, channels, kernel_h, kernel_w, suffix):
     #ifdef __cplusplus
     extern "C"
    #endif
-    int32_t kernel_convolve_{tensor_w}_{channels}_{kernel_h}_{kernel_w}_{suffix}(
+    int32_t kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}(
         uint32_t *out,
         uint32_t *tensor,
         uint32_t *packed_kernel) {{
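
A quick sanity check on the new strategy guard, for anyone reviewing the dtype/shape/padding
conditions above: the sketch below restates those checks as a standalone function. It is not
part of the patch; the helper name would_use_dsp_depthwise and its tuple-style arguments are
hypothetical, and the real guard additionally requires a DSP-capable target, a 3x3 HWOI
depthwise kernel, no dilation, and channel_multiplier == 1, as in the diff.

# Standalone sketch (hypothetical helper, not in the patch) of the dtype/shape/padding
# guard added to conv2d_strategy_arm_cpu for the NHWC DSP depthwise schedule.
def would_use_dsp_depthwise(data_shape, data_dtype, out_dtype, padding, strides):
    _, height, width, channels = data_shape
    stride_h, stride_w = strides
    return (
        data_dtype == "int8"
        and out_dtype == "int32"
        and channels % 4 == 0
        # "SAME" padding is accepted only when both spatial dims divide evenly by
        # their strides; this spells out the chained comparison used in the patch.
        and (padding != "SAME" or (height % stride_h == 0 and width % stride_w == 0))
    )

# A 1x48x48x16 int8 tensor with stride 2 and "SAME" padding passes the guard, while a
# 49x49 input with the same settings falls back to the generic depthwise schedule.
assert would_use_dsp_depthwise((1, 48, 48, 16), "int8", "int32", "SAME", (2, 2))
assert not would_use_dsp_depthwise((1, 49, 49, 16), "int8", "int32", "SAME", (2, 2))

Note that the chained comparison in the patch, data.shape[1] % stride_h == data.shape[2] %
stride_w == 0, is equivalent to requiring both remainders to be zero, which is what the
sketch spells out explicitly.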