From d5d4caa76d975adc81e5960872ad88507107628e Mon Sep 17 00:00:00 2001
From: Gavin Uberti
Date: Fri, 2 Sep 2022 11:02:10 -0700
Subject: [PATCH] Address comments from Ashutosh

---
 python/tvm/relay/op/strategy/arm_cpu.py                 | 10 +++++-----
 .../tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py   |  5 +++--
 .../mprofile/dsp/micro_kernel/quad_channel_convolve.py  |  4 ++--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 6d19982c995f..2d9ef99ba8a6 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -237,11 +237,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 )
 
             # Optimized special case depthwiseConv2D operation. Requires a 3x3 kernel, a
-            # NHWC layout, a HWOI kernel layout (which we would ideally rearrange), no dilation,
-            # "SAME" padding, int8 inputs and outputs, the same number of input and output
-            # channels, and for that channel count to be divisible by 4.
-            #
-            # Additional work could remove some of these restrictions.
+            # NHWC layout, a HWOI kernel layout (which we rearrange), no dilation, int8 inputs,
+            # int32 output, the same number of input and output channels, and for that channel
+            # count to be divisible by 4. Additional work could remove these restrictions.
             elif (
                 target.features.has_dsp
@@ -249,7 +247,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                 and dilation_w == dilation_h == 1
                 and kernel.shape[3] == 1  # channel_multiplier == 1
                 and data.dtype == "int8"
+                and out_type.dtype == "int32"
                 and data.shape[3] % 4 == 0
+                and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
             ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
index ede822da76b3..162bf65a21f9 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py
@@ -149,8 +149,9 @@ def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dila
     output_h = height // stride_h
     output_w = width // stride_w
 
-    # Note - this padding behavior is DIFFERENT from Tensorflow, which pads the top left if
-    # stride > 1. Need to investigate and decide which behavior we want.
+    # This padding behavior is consistent with other TVM depthwise_conv2d schedules. However,
+    # it differs from TensorFlow, which only pads the bottom right if stride > 1. This may
+    # slightly reduce accuracy for models imported from TFLite.
     pad_down = 1 if stride_h == 1 else 0
     pad_right = 1 if stride_w == 1 else 0
 
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
index 4d8536866d47..960ef8fadc0e 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py
@@ -70,7 +70,7 @@ def intrin_func(ins, outs):
         builder.emit(
             tir.call_extern(
                 "int32",
-                f"kernel_convolve_{tensor_w}_{channels}_{kernel_h}_{kernel_w}_{suffix}",
+                f"kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}",
                 outs[0].access_ptr("w"),
                 ins[0].access_ptr("r"),
                 ins[1].access_ptr("r"),
@@ -131,7 +131,7 @@ def quad_channel_convolve_impl(tensor_w, channels, kernel_h, kernel_w, suffix):
     #ifdef __cplusplus
     extern "C"
    #endif
-    int32_t kernel_convolve_{tensor_w}_{channels}_{kernel_h}_{kernel_w}_{suffix}(
+    int32_t kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}(
         uint32_t *out,
         uint32_t *tensor,
         uint32_t *packed_kernel) {{
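
A quick sanity check on the new strategy guard, for anyone reviewing the dtype/shape/padding
conditions above: the sketch below restates those checks as a standalone function. It is not
part of the patch; the helper name would_use_dsp_depthwise and its tuple-style arguments are
hypothetical, and the real guard additionally requires a DSP-capable target, a 3x3 HWOI
depthwise kernel, no dilation, and channel_multiplier == 1, as in the diff.

# Standalone sketch (hypothetical helper, not in the patch) of the dtype/shape/padding
# guard added to conv2d_strategy_arm_cpu for the NHWC DSP depthwise schedule.
def would_use_dsp_depthwise(data_shape, data_dtype, out_dtype, padding, strides):
    _, height, width, channels = data_shape
    stride_h, stride_w = strides
    return (
        data_dtype == "int8"
        and out_dtype == "int32"
        and channels % 4 == 0
        # "SAME" padding is accepted only when both spatial dims divide evenly by
        # their strides; this spells out the chained comparison used in the patch.
        and (padding != "SAME" or (height % stride_h == 0 and width % stride_w == 0))
    )

# A 1x48x48x16 int8 tensor with stride 2 and "SAME" padding passes the guard, while a
# 49x49 input with the same settings falls back to the generic depthwise schedule.
assert would_use_dsp_depthwise((1, 48, 48, 16), "int8", "int32", "SAME", (2, 2))
assert not would_use_dsp_depthwise((1, 49, 49, 16), "int8", "int32", "SAME", (2, 2))

Note that the chained comparison in the patch, data.shape[1] % stride_h == data.shape[2] %
stride_w == 0, is equivalent to requiring both remainders to be zero, which is what the
sketch spells out explicitly.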