From 06740db483277b7b041d00e0e598d85b582bcadd Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 13 May 2020 17:30:13 +0100 Subject: [PATCH 01/15] Improve quantized conv2d performance for armv8 Signed-off-by: Giuseppe Rossini Change-Id: I3a3d29f5332dd9b3354e8e0dfb24677a521f9c8f --- python/tvm/relay/op/nn/_nn.py | 18 ++ python/tvm/relay/op/nn/nn.py | 87 ++++++ python/tvm/relay/op/strategy/arm_cpu.py | 52 +++- python/tvm/relay/op/strategy/generic.py | 13 + python/tvm/relay/qnn/op/legalizations.py | 10 +- src/relay/op/nn/convolution.cc | 80 +++++ src/relay/op/nn/convolution.h | 104 +++++++ topi/python/topi/arm_cpu/conv2d_alter_op.py | 36 ++- topi/python/topi/arm_cpu/conv2d_gemm.py | 143 +++++++++ topi/python/topi/arm_cpu/conv2d_int8.py | 31 +- topi/python/topi/arm_cpu/tensor_intrin.py | 327 ++++++++++++++++++++ topi/python/topi/generic/nn.py | 19 ++ topi/python/topi/nn/conv2d.py | 25 ++ 13 files changed, 934 insertions(+), 11 deletions(-) create mode 100644 topi/python/topi/arm_cpu/conv2d_gemm.py diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 1c76f57a6343..b854f4ac5e28 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -446,6 +446,24 @@ def compute_mirror_pad(attrs, inputs, out_dtype): reg.register_pattern("nn.contrib_conv2d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) +# conv2d_gemm related operators +reg.register_strategy("nn.contrib_conv2d_gemm_without_weight_transform", + strategy.conv2d_gemm_without_weight_transform_strategy) +reg.register_pattern("nn.contrib_conv2d_gemm_without_weight_transform", + OpPattern.OUT_ELEMWISE_FUSABLE) + + +@reg.register_compute("nn.contrib_conv2d_gemm_weight_transform") +def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype): + """Compute definition of contrib_conv2d_gemm_weight_transform""" + out = topi.nn.conv2d_gemm_weight_transform( + inputs[0]) + return [out] + +reg.register_schedule("nn.contrib_conv2d_gemm_weight_transform", + strategy.schedule_conv2d_gemm_weight_transform) +reg.register_pattern("nn.contrib_conv2d_gemm_weight_transform", + OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_compute("nn.contrib_conv2d_winograd_weight_transform") def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 34d07dce2863..d6538ea6b428 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -2046,6 +2046,74 @@ def contrib_conv2d_winograd_without_weight_transform(data, kernel_layout, out_layout, out_dtype) +def contrib_conv2d_gemm_without_weight_transform(data, + weight, + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1), + groups=1, + channels=None, + kernel_size=None, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + out_dtype=""): + r"""2D convolution with gemm algorithm. + + The basic parameters are the same as the ones in vanilla conv2d. + It assumes the weight is pre-transformed by nn.contrib_conv2d_gemm_weight_transform + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + weight : tvm.relay.Expr + The weight expressions. + + strides : tuple of int, optional + The strides of convolution. + + padding : tuple of int, optional + The padding of convolution on both sides of inputs before convolution. + + dilation : tuple of int, optional + Specifies the dilation rate to be used for dilated convolution. + + groups : int, optional + Number of groups for grouped convolution. 
+ + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial of the convolution kernel. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : str, optional + Layout of the output, by default, out_layout is the same as data_layout + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # convert 2-way padding to 4-way padding + padding = get_pad_tuple2d(padding) + return _make.contrib_conv2d_gemm_without_weight_transform( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype) + + def contrib_conv2d_nchwc(data, kernel, strides=(1, 1), @@ -2204,6 +2272,25 @@ def contrib_conv2d_winograd_weight_transform(weight, return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size) +def contrib_conv2d_gemm_weight_transform(weights): + r"""Weight Transformation part for 2D convolution with gemm algorithm. + + We separate this as a single op to enable pre-compute for inference. + Use this together with nn.contrib_conv2d_gemm_without_weight_transform + + Parameters + ---------- + weights : tvm.relay.Expr + The weight expressions. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_conv2d_gemm_weight_transform(weights) + + def contrib_conv3d_winograd_weight_transform(weight, tile_size): r"""Weight Transformation part for 3D convolution with winograd algorithm. diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 6bdec67617e1..8d35ee3f23ce 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -112,10 +112,17 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), name='conv2d_direct_simd.micro_dev') elif kernel_layout == "HWIO": - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + is_aarch64 = "aarch64" in str(isa.target) + if is_aarch64 and data.dtype in ["int8", "uint8"]: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), + name="compute_conv2d_NHWC_quantized.arm_cpu") + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC". 
format(kernel_layout)) @@ -246,6 +253,43 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out format(layout)) return strategy +def wrap_compute_conv2d_gemm(topi_compute): + """wrap topi compute for conv2d_gemm""" + + def _compute_conv2d_gemm(attrs, inputs, out_type): + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + out_dtype = attrs.get_str("out_dtype") + channels = attrs['channels'] + kernel_size = attrs['kernel_size'] + out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype + return [topi_compute(inputs[0], inputs[1], strides, padding, + dilation, out_dtype, kernel_size, channels)] + + return _compute_conv2d_gemm + +@conv2d_gemm_without_weight_transform_strategy.register("arm_cpu") +def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom arm cpu strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.data_layout + strides = attrs.get_int_tuple("strides") + data = inputs[0] + kernel = inputs[1] + strategy = _op.OpStrategy() + + if layout == "NHWC" and data.dtype in ['int8', 'uint8']: + strategy.add_implementation( + wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), + name="conv2d_NHWC_quantized_without_transform.arm_cpu") + else: + raise RuntimeError("Unsupported conv2d_gemm_without_weight_transform layout {0} with datatype {1}". + format(layout, data.dtype)) + return strategy + @conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d_transpose arm cpu strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index b1fb421c3e2e..a0dd6bfe7b15 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -266,6 +266,12 @@ def conv2d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, t """conv2d_winograd_without_weight_transfrom generic strategy""" raise ValueError("No generic implemenation for conv2d_winograd_without_weight_transform") +# conv2d_gemm_without_weight_transform +@override_native_generic_func("conv2d_gemm_without_weight_transform_strategy") +def conv2d_gemm_without_weight_transform_strategy(attrs, inputs, out_type, target): + """conv2d_gemm_without_weight_transfrom generic strategy""" + raise ValueError("No generic implemenation for conv2d_gemm_without_weight_transform") + # conv2d_winograd_weight_transform @generic_func def schedule_conv2d_winograd_weight_transform(attrs, outs, target): @@ -280,6 +286,13 @@ def schedule_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): with target: return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) +# conv2d_gemm_weight_transform +@generic_func +def schedule_conv2d_gemm_weight_transform(attrs, outs, target): + """Schedule conv2d_gemm_weight_transform""" + with target: + return topi.generic.schedule_conv2d_gemm_weight_transform(outs) + # deformable_conv2d def wrap_compute_deformable_conv2d(topi_compute): """wrap deformable_conv2d topi compute""" diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index d3b0e44a1a13..4eac33333eef 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ 
b/python/tvm/relay/qnn/op/legalizations.py
@@ -237,6 +237,11 @@ def is_fast_int8_on_arm():
     target = tvm.target.Target.current(allow_none=False)
     return '+v8.2a,+dotprod' in ' '.join(target.options)
 
+def is_aarch64_arm():
+    """ Checks whether we are compiling for an AArch64 target. """
+    target = tvm.target.Target.current(allow_none=False)
+    return 'aarch64' in ' '.join(target.options)
+
 ########################
 # ARM CPU legalizations.
 ########################
@@ -244,9 +249,10 @@ def is_fast_int8_on_arm():
 @qnn_conv2d_legalize.register('arm_cpu')
 def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
     # ARM prefers the dtypes to be same.
-    if is_fast_int8_on_arm():
+    if is_aarch64_arm():
         return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
-    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
+    return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.qnn.op.conv2d)
+
 
 @qnn_dense_legalize.register('arm_cpu')
 def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
index 6c6eb1ecb8b2..55ae2d7f1575 100644
--- a/src/relay/op/nn/convolution.cc
+++ b/src/relay/op/nn/convolution.cc
@@ -77,6 +77,27 @@ Expr MakeConvWinograd(Expr data, Expr weight, int tile_size, Array st
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
 
+template
+Expr MakeConvGemm(Expr data, Expr weight, Array strides,
+    Array padding, Array dilation, int groups,
+    IndexExpr channels, Array kernel_size, std::string data_layout,
+    std::string kernel_layout, std::string out_layout, DataType out_dtype,
+    std::string op_name) {
+  auto attrs = make_object();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->groups = groups;
+  attrs->channels = std::move(channels);
+  attrs->kernel_size = std::move(kernel_size);
+  attrs->data_layout = std::move(data_layout);
+  attrs->kernel_layout = std::move(kernel_layout);
+  attrs->out_layout = std::move(out_layout);
+  attrs->out_dtype = std::move(out_dtype);
+  const Op& op = Op::Get(op_name);
+  return Call(op, {data, weight}, Attrs(attrs), {});
+}
+
 Expr MakeConvWinogradWeightTransform(Expr weight, int tile_size, std::string op_name) {
   auto attrs = make_object();
   attrs->tile_size = tile_size;
@@ -84,6 +105,11 @@ Expr MakeConvWinogradWeightTransform(Expr weight, int tile_size, std::string op_
   return Call(op, {weight}, Attrs(attrs), {});
 }
 
+Expr MakeConvGemmWeightTransform(Expr weight, std::string op_name) {
+  const Op& op = Op::Get(op_name);
+  return Call(op, {weight});
+}
+
 template
 Expr MakeConvTranspose(Expr data, Expr weight, Array strides, Array padding,
                        Array dilation, int groups, IndexExpr channels,
@@ -504,6 +530,59 @@ weight transformation in advance.
    .set_support_level(10)
    .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel);
 
+// relay.nn.contrib_conv2d_gemm_without_weight_transform
+TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_without_weight_transform")
+    .set_body_typed([](Expr data, Expr weight, Array strides,
+        Array padding, Array dilation, int groups,
+        IndexExpr channels, Array kernel_size, std::string data_layout,
+        std::string kernel_layout, std::string out_layout, DataType out_dtype) {
+      return MakeConvGemm(
+          data, weight, strides, padding, dilation, groups, channels, kernel_size,
+          data_layout, kernel_layout, out_layout, out_dtype,
+          "nn.contrib_conv2d_gemm_without_weight_transform");
+    });
+
+RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform")
+    .describe(R"code(Compute conv2d with gemm algorithm. Only supports NHWC layout.
+    This operator assumes the weight tensor is already pre-transformed by
+    nn.contrib_conv2d_gemm_weight_transform.
+
+- **data**: Input is 4D array of shape (batch_size, height, width, in_channels)
+- **weight**: Any shape
+              We do not check the shape for this input tensor, since different backends
+              have different layout strategies.
+
+- **out**: Output is 4D array of shape (batch_size, out_height, out_width, channels)
+)code" TVM_ADD_FILELINE)
+    .set_attrs_type()
+    .set_num_inputs(2)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .add_argument("weight", "Tensor", "The weight tensor.")
+    .set_support_level(10)
+    .add_type_rel("Conv2DGemm", Conv2DGemmRel)
+    .set_attr("FInferCorrectLayout",
+              ConvInferCorrectLayout);
+
+// relay.nn.contrib_conv2d_gemm_weight_transform
+
+TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_weight_transform")
+    .set_body_typed([](Expr weights) {
+      return MakeConvGemmWeightTransform(weights,
+                                         "nn.contrib_conv2d_gemm_weight_transform");
+    });
+
+RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_weight_transform")
+    .describe(R"code(Weight transformation of GEMM convolution algorithm.
+
+Separate this into another operator in order to enable Precompute Pass to compute the
+weight transformation in advance.
+
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("weights", "Tensor", "The weights tensor.")
+    .set_support_level(10)
+    .add_type_rel("Conv2DGemmWeightTransform", Conv2DGemmWeightTransformRel);
+
 // Positional relay function to create conv2d NCHWc operator
 // used by frontend FFI.
TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc") @@ -600,5 +679,6 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.deformable_conv2d") kernel_size, data_layout, kernel_layout, out_layout, out_dtype, "nn.deformable_conv2d"); }); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 0c5b20a153cf..f2dba77fcc00 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -383,6 +383,38 @@ inline bool Conv2DWinogradWeightTransformRel(const Array& types, int num_i return true; } +// Gemm convolution shape relations +inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_inputs, + const Attrs& attrs, const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) return false; + + CHECK_EQ(data->shape.size(), 4) << "Only support HWIO kernel layout"; + + const auto K = data->shape[0]*data->shape[1]*data->shape[2]; + const auto N = data->shape[3]; + + auto k_mod_16 = indexmod(K, 16); + auto n_mod_4 = indexmod(N, 4); + + auto pad_k = tvm::if_then_else(k_mod_16 != 0, 16 - k_mod_16, tir::make_zero(DataType::Int(32))); + auto pad_n = tvm::if_then_else(n_mod_4 != 0, 4 - n_mod_4, tir::make_zero(DataType::Int(32))); + + const auto N_padded = N + pad_n; + const auto K_padded = K + pad_k; + + Array oshape{ + indexdiv(N_padded, 4), + indexdiv(K_padded, 16), + 4, + 16, + }; + + reporter->Assign(types[1], TensorType(oshape, data->dtype)); + return true; +} + inline bool Conv3DWinogradWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); @@ -519,6 +551,78 @@ bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& at return true; } +template +bool Conv2DGemmRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + if (data == nullptr) return false; + static const Layout kNHWC("NHWC"); + static const Layout kHWIO("HWIO"); + + const AttrType* param = attrs.as(); + CHECK(param != nullptr); + const Layout in_layout(param->data_layout); + const Layout kernel_layout(param->kernel_layout); + + const auto trans_in_layout = tir::BijectiveLayout(in_layout, kNHWC); + CHECK(trans_in_layout.defined()) + << "Conv only support input layouts that are convertible from NHWC." + << " But got " << in_layout; + + const auto trans_kernel_layout = tir::BijectiveLayout(kernel_layout, kHWIO); + CHECK(trans_kernel_layout.defined()) + << "Conv only support kernel layouts that are convertible from HWIO." + << " But got " << kernel_layout; + + Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); + const auto trans_out_layout = tir::BijectiveLayout(out_layout, kNHWC); + CHECK(trans_out_layout.defined()) + << "Conv only support output layouts that are convertible from NHWC." 
+ << " But got " << out_layout; + + Array dshape_nhwc = trans_in_layout.ForwardShape(data->shape); + + IndexExpr channels, dilated_ksize_y, dilated_ksize_x; + + CHECK(param->kernel_size.defined() && param->channels.defined()) + << "The kernel size and channels of a Conv must be set or inferred by previous pass"; + + CHECK_EQ(param->kernel_size.size(), 2); + CHECK_EQ(param->dilation.size(), 2); + + channels = param->channels; + dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; + dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; + + // NOTE: Do not check weight shape here! + + // dilation + Array oshape({dshape_nhwc[0], 0, 0, channels}); + + IndexExpr pad_h, pad_w; + GetPaddingHeightWidth(param->padding, &pad_h, &pad_w); + if (!dshape_nhwc[2].as()) { + oshape.Set(1, (dshape_nhwc[1] + pad_h - dilated_ksize_y) / param->strides[0] + 1); + } else { + oshape.Set(1, dshape_nhwc[1]); + } + if (!dshape_nhwc[3].as()) { + oshape.Set(2, (dshape_nhwc[2] + pad_w - dilated_ksize_x) / param->strides[1] + 1); + } else { + oshape.Set(2, dshape_nhwc[2]); + } + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + oshape = trans_out_layout.BackwardShape(oshape); + // assign output type + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + template bool Conv3DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 3206168d51bd..348e43b1dfc7 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -59,10 +59,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - # We only perform layout alteration for NCHW data layout. 
- if data_layout == "NHWC": - return None - # Extract data types data_tensor, kernel_tensor = tinfos data_dtype = data_tensor.dtype @@ -70,6 +66,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): idxd = tvm.tir.indexdiv + # We don't perform layout alteration for NHWC layout with real data types + if data_layout == "NHWC" and data_dtype not in ['uint8', 'int8']: + return None + if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) @@ -235,5 +235,33 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) + if topi_tmpl == "compute_conv2d_NHWC_quantized.arm_cpu": + assert (data.dtype == 'int8' and kernel.dtype == 'int8' or + data.dtype == 'uint8' and kernel.dtype == 'uint8') + CO, IC, KH, KW = get_const_tuple(kernel.shape) + + K = KH * KW * IC + N = CO + + pad_k = 0 + pad_n = 0 + + if N % 4 != 0: + pad_n = 4 - (N % 4) + + if K % 16 != 0: + pad_k = 16 - (K % 16) + + N_padded = N + pad_n + K_padded = K + pad_k + + kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1]) + new_kernel = te.placeholder((N_padded // 4, K_padded // 16, 4, 16), kernel.dtype) + + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), CO], "conv2d_NHWC_int8_without_tranform.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], kernel_expr, **new_attrs) return None diff --git a/topi/python/topi/arm_cpu/conv2d_gemm.py b/topi/python/topi/arm_cpu/conv2d_gemm.py new file mode 100644 index 000000000000..fddea47af450 --- /dev/null +++ b/topi/python/topi/arm_cpu/conv2d_gemm.py @@ -0,0 +1,143 @@ +import tvm +import topi +from tvm import te +from topi import nn +from ..util import get_const_tuple +from ..nn.util import get_pad_tuple +from .tensor_intrin import gemv_quantized, gemv_quantized_impl + + +# Compute function +def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels): + + batches, IH, IW, IC = get_const_tuple(data.shape) + + KH, KW = kernel_size + OC = output_channels + + K_AREA = KH * KW + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + dilated_kernel_h = (KH - 1) * dilation_h + 1 + dilated_kernel_w = (KW - 1) * dilation_w + 1 + + pad_top, pad_left, pad_down, pad_right = \ + get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) + HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + + OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 + OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 + if pad_top or pad_left: + data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad") + else: + data_pad = data + + # --- Im2col + M = OH * OW + K = IC * K_AREA + N = OC + + A_shape = (batches, M, K) + if K_AREA == 1: + A = te.compute(A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], + name='data_flatten') + else: + A = te.compute(A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW) + dilation_h * (y // IC) // KW, WSTR * (x % OW) + dilation_w * (y // IC) % KW, y % IC], + name='data_im2col') + N_transformed = 
B_interleaved_t.shape[0] + + # --- Pad if necessary + idxm = tvm.tir.indexmod + + pad_m = 0 + pad_k = 0 + + if M % 4 != 0: + pad_m = 4 - (M % 4) + + if K % 16 != 0: + pad_k = 16 - (K % 16) + + M_padded = M + pad_m + K_padded = K + pad_k + + pad_before = (0, 0, 0) + pad_after = (0, pad_m, pad_k) + + if pad_m != 0 or pad_k != 0: + A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") + + # --- GEMM: A*B' + k = te.reduce_axis((0, K_padded), "k") + + A_interleaved = te.compute((batches, M_padded // 4, K_padded // 16, 4, 16), lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], name='A_interleaved') + + C_interleaved = te.compute((batches, M_padded // 4, N_transformed, 4, 4), + lambda b, x, y, w, z: te.sum(A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) * + B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), axis=k), + name='C_interleaved') + + # --- Unpack C + C = te.compute((batches, M, N), lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], name="C", tag='injective') + + # --- Produce the conv output + out_shape = (batches, OH, OW, OC) + out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name='conv2d_gemm_output') + + return out + +# Schedules + + +def schedule_conv2d_gemm(cfg, s, out): + + C = out.op.input_tensors[0] + C_interleaved = C.op.input_tensors[0] + A_interleaved = C_interleaved.op.input_tensors[0] + + # Input transform + A_interleaved_input = A_interleaved.op.input_tensors[0] + if A_interleaved_input.op.name == "A_padded": + s[A_interleaved_input].compute_at(s[A_interleaved], A_interleaved.op.axis[3]) + s[A_interleaved_input].vectorize(A_interleaved_input.op.axis[2]) + s[A_interleaved_input].compute_inline() + data_im2col = A_interleaved_input.op.input_tensors[0] + else: + data_im2col = A_interleaved_input + + b, m, n = data_im2col.op.axis + if data_im2col.op.name == "data_im2col": + n_outer, n_inner = s[data_im2col].split(n, 16) + s[data_im2col].unroll(n_outer) + s[data_im2col].vectorize(n_inner) + else: + s[data_im2col].compute_inline() + + # Computation(through tensorize) + b, xo, yo, xi, yi = C_interleaved.op.axis + s[C_interleaved].reorder(xo, yo, yi, xi) + s[C_interleaved].parallel(xo) + s[A_interleaved].compute_at(s[C_interleaved], xo) + s[A_interleaved].vectorize(A_interleaved.op.axis[4]) + + in_type = A_interleaved.dtype + out_type = C.dtype + if out_type == 'int32': + K = A_interleaved_input.shape[2] + _, M, N = C.shape + assert in_type in ['int8', 'uint8'], "Only int8 and uint8 gemm are supported" + + gem_v_dotprod = gemv_quantized(M, N, K, in_type, out_type) + s[C_interleaved].pragma(xo, "import_llvm", gemv_quantized_impl(M, N, in_type)) + s[C_interleaved].tensorize(yi, gem_v_dotprod) + + # Output transform + N, OH, OW, OC = out.shape + s[C].split(C.op.axis[1], OW) + s[C].compute_at(s[out], out.op.axis[3]) + + return s diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index 06412b656b4b..e5ae40f80cd6 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -19,11 +19,13 @@ from tvm import te from tvm import autotvm from .. import tag -from ..util import get_const_tuple +from ..util import traverse_inline, get_const_tuple from ..generic import conv2d as conv2d_generic from .. 
import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload +from ..nn.util import get_const_int, get_pad_tuple from .tensor_intrin import dot_int8_int8_int32 +from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @@ -109,3 +111,30 @@ def traverse(op): traverse(outs[0].op) return s + + +@autotvm.register_topi_compute("compute_conv2d_NHWC_quantized.arm_cpu") +def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): + N, IH, IW, IC = get_const_tuple(data.shape) + KH, KW, _, OC = get_const_tuple(kernel.shape) + kernel = nn.conv2d_gemm_weight_transform(kernel) + return compute_conv2d_gemm_without_weight_transform(cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC) + + +@autotvm.register_topi_compute("compute_conv2d_NHWC_quantized_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None): + return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels) + + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") +def schedule_conv2d_NHWC_quantized(cfg, outs): + """Create schedule for tensors""" + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + """Traverse operators from computation graph""" + if op.name == "conv2d_gemm_output": + schedule_conv2d_gemm(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/arm_cpu/tensor_intrin.py b/topi/python/topi/arm_cpu/tensor_intrin.py index bab91578e77e..d8285fa6e878 100644 --- a/topi/python/topi/arm_cpu/tensor_intrin.py +++ b/topi/python/topi/arm_cpu/tensor_intrin.py @@ -20,6 +20,333 @@ import tvm from tvm import te + +def gemv_quantized_impl(M, N, data_type = 'uint8'): + stepA = min(4, M) + stepB = min(4, N) + assert data_type in ['uint8', 'int8'], 'Only uint8 and int8 are supported for this implementation' + + cc_code = """ + extern "C" int gemv_{0}_{0}_int32_{1}_{2}(int *c_buffer, unsigned char *a_buffer, unsigned char *b_buffer, int K, int m, int n) """.format(data_type, stepA, stepB) + + cc_code += """ + { + unsigned char * a_ptr = a_buffer; + unsigned char * b_ptr = b_buffer; + int * c_ptr = c_buffer; + + int k = K / 16; + + __asm__ __volatile__ ( + "movi v16.4s, #0\\n" + "movi v17.4s, #0\\n" + "movi v18.4s, #0\\n" + "movi v19.4s, #0\\n" + "movi v20.4s, #0\\n" + "movi v21.4s, #0\\n" + "movi v22.4s, #0\\n" + "movi v23.4s, #0\\n" + "movi v24.4s, #0\\n" + "movi v25.4s, #0\\n" + "movi v26.4s, #0\\n" + "movi v27.4s, #0\\n" + "movi v28.4s, #0\\n" + "movi v29.4s, #0\\n" + "movi v30.4s, #0\\n" + "movi v31.4s, #0\\n" + "1:" + """ + + cc_code += ' "ldr q0, [%[a_ptr]]\\n" ' + + if M > 1: + cc_code += ' "ldr q1, [%[a_ptr], #16]\\n" ' + else: + cc_code += ' "movi v1.4s, #0\\n" ' + + if M > 2: + cc_code += ' "ldr q2, [%[a_ptr], #32]\\n" ' + else: + cc_code += ' "movi v2.4s, #0\\n" ' + + if M > 3: + cc_code += ' "ldr q3, [%[a_ptr], #48]\\n" ' + else: + cc_code += ' "movi v3.4s, #0\\n" ' + + cc_code += ' "ldr q4, [%[b_ptr]]\\n" ' + + if N > 1: + cc_code += ' "ldr q5, [%[b_ptr], #16]\\n" ' + + if N > 2: + cc_code += ' "ldr q6, [%[b_ptr], #32]\\n" ' + + if N > 3: + cc_code += ' "ldr q7, [%[b_ptr], #48]\\n" ' + + cc_code += """ + // First half + // Higher part of a0 * {b0,b1,b2,b3} + "umull v8.8h, v0.8b, v4.8b\\n" + "umull 
v9.8h, v0.8b, v5.8b\\n" + "umull v10.8h, v0.8b, v6.8b\\n" + "umull v11.8h, v0.8b, v7.8b\\n" + + // Higher part of a1 * {b0,b1,b2,b3} + "umull v12.8h, v1.8b, v4.8b\\n" + "umull v13.8h, v1.8b, v5.8b\\n" + "umull v14.8h, v1.8b, v6.8b\\n" + "umull v15.8h, v1.8b, v7.8b\\n" + + // Accumulate + "uadalp v16.4s, v8.8h\\n" + "uadalp v17.4s, v9.8h\\n" + "uadalp v18.4s, v10.8h\\n" + "uadalp v19.4s, v11.8h\\n" + "uadalp v20.4s, v12.8h\\n" + "uadalp v21.4s, v13.8h\\n" + "uadalp v22.4s, v14.8h\\n" + "uadalp v23.4s, v15.8h\\n" + + // Lower part of a0 * {b0,b1,b2,b3} + "umull2 v8.8h, v0.16b, v4.16b\\n" + "umull2 v9.8h, v0.16b, v5.16b\\n" + "umull2 v10.8h, v0.16b, v6.16b\\n" + "umull2 v11.8h, v0.16b, v7.16b\\n" + + // Lower part of a1 * {b0,b1,b2,b3} + "umull2 v12.8h, v1.16b, v4.16b\\n" + "umull2 v13.8h, v1.16b, v5.16b\\n" + "umull2 v14.8h, v1.16b, v6.16b\\n" + "umull2 v15.8h, v1.16b, v7.16b\\n" + + // Accumulate again + "uadalp v16.4s, v8.8h\\n" + "uadalp v17.4s, v9.8h\\n" + "uadalp v18.4s, v10.8h\\n" + "uadalp v19.4s, v11.8h\\n" + "uadalp v20.4s, v12.8h\\n" + "uadalp v21.4s, v13.8h\\n" + "uadalp v22.4s, v14.8h\\n" + "uadalp v23.4s, v15.8h\\n" + + // Second half + + // Lower part of a2 * {b0,b1,b2,b3} + "umull v8.8h, v2.8b, v4.8b\\n" + "umull v9.8h, v2.8b, v5.8b\\n" + "umull v10.8h, v2.8b, v6.8b\\n" + "umull v11.8h, v2.8b, v7.8b\\n" + + // Lower part of a3 * {b0,b1,b2,b3} + "umull v12.8h, v3.8b, v4.8b\\n" + "umull v13.8h, v3.8b, v5.8b\\n" + "umull v14.8h, v3.8b, v6.8b\\n" + "umull v15.8h, v3.8b, v7.8b\\n" + + // Accumulate + "uadalp v24.4s, v8.8h\\n" + "uadalp v25.4s, v9.8h\\n" + "uadalp v26.4s, v10.8h\\n" + "uadalp v27.4s, v11.8h\\n" + "uadalp v28.4s, v12.8h\\n" + "uadalp v29.4s, v13.8h\\n" + "uadalp v30.4s, v14.8h\\n" + "uadalp v31.4s, v15.8h\\n" + + // Higher part of a2 * {b0,b1,b2,b3} + "umull2 v8.8h, v2.16b, v4.16b\\n" + "umull2 v9.8h, v2.16b, v5.16b\\n" + "umull2 v10.8h, v2.16b, v6.16b\\n" + "umull2 v11.8h, v2.16b, v7.16b\\n" + + // Higher part of a3 * {b0,b1,b2,b3} + "umull2 v12.8h, v3.16b, v4.16b\\n" + "umull2 v13.8h, v3.16b, v5.16b\\n" + "umull2 v14.8h, v3.16b, v6.16b\\n" + "umull2 v15.8h, v3.16b, v7.16b\\n" + + // Accumulate again + "uadalp v24.4s, v8.8h\\n" + "uadalp v25.4s, v9.8h\\n" + "uadalp v26.4s, v10.8h\\n" + "uadalp v27.4s, v11.8h\\n" + "uadalp v28.4s, v12.8h\\n" + "uadalp v29.4s, v13.8h\\n" + "uadalp v30.4s, v14.8h\\n" + "uadalp v31.4s, v15.8h\\n" + """ + blockA = min(64, M * 16) + blockB = min(64, N * 16) + + cc_code += """ + // Increment pointers and decrement k + "add %[a_ptr], %[a_ptr], #{0}\\n" + "add %[b_ptr], %[b_ptr], #{1}\\n" + "subs %w[k], %w[k], #1\\n" + """.format(blockA, blockB) + + stepC = min(4, N) + + cc_code += """ + "cbnz %w[k], 1b\\n" + + // Final additions + + // v16 contains the four partial sums of a[0, 0:K].*b[0,0:K], let's call them (a,b,c,d) + // v17 contains the four partial sums of a[0, 0:K].*b[1,0:K], let's call them (e,f,g,h) + // v18 contains the four partial sums of a[0, 0:K].*b[2,0:K], let's call them (i,j,k,l) + // v19 contains the four partial sums of a[0, 0:K].*b[3,0:K], let's call them (m,n,o,p) + "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b, c+d, e+f, g+h) + "addp v17.4s, v18.4s, v19.4s\\n" // v17 = (i+j, k+l, m+n, o+p) + "addp v16.4s, v16.4s, v17.4s\\n" // v16 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + // v20 contains the four partial sums of a[1, 0:K].*b[0,0:K], let's call them (a,b,c,d) + // v21 contains the four partial sums of a[1, 0:K].*b[1,0:K], let's call them (e,f,g,h) + // v22 contains the four partial sums of a[1, 0:K].*b[2,0:K], let's 
call them (i,j,k,l) + // v23 contains the four partial sums of a[1, 0:K].*b[3,0:K], let's call them (m,n,o,p) + "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b, c+d, e+f, g+h) + "addp v21.4s, v22.4s, v23.4s\\n" // v21 = (i+j, k+l, m+n, o+p) + "addp v20.4s, v20.4s, v21.4s\\n" // v20 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + // v24 contains the four partial sums of a[2, 0:K].*b[0,0:K], let's call them (a,b,c,d) + // v25 contains the four partial sums of a[2, 0:K].*b[1,0:K], let's call them (e,f,g,h) + // v26 contains the four partial sums of a[2, 0:K].*b[2,0:K], let's call them (i,j,k,l) + // v27 contains the four partial sums of a[2, 0:K].*b[3,0:K], let's call them (m,n,o,p) + "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b, c+d, e+f, g+h) + "addp v25.4s, v26.4s, v27.4s\\n" // v25 = (i+j, k+l, m+n, o+p) + "addp v24.4s, v24.4s, v25.4s\\n" // v24 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + // v28 contains the four partial sums of a[3, 0:K].*b[0,0:K], let's call them (a,b,c,d) + // v29 contains the four partial sums of a[3, 0:K].*b[1,0:K], let's call them (e,f,g,h) + // v30 contains the four partial sums of a[3, 0:K].*b[2,0:K], let's call them (i,j,k,l) + // v31 contains the four partial sums of a[3, 0:K].*b[3,0:K], let's call them (m,n,o,p) + "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b, c+d, e+f, g+h) + "addp v29.4s, v30.4s, v31.4s\\n" // v29 = (i+j, k+l, m+n, o+p) + "addp v28.4s, v28.4s, v29.4s\\n" // v28 = (a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p) + + "str q16, [%[c_ptr]]\\n" + """ + + if M > 1: + cc_code += ' "str q20, [%[c_ptr], #{0}]\\n" '.format(stepC * 4) + + if M > 2: + cc_code += ' "str q24, [%[c_ptr], #{0}]\\n" '.format(stepC * 8) + + if M > 3: + cc_code += ' "str q28, [%[c_ptr], #{0}]\\n" '.format(stepC * 12) + + cc_code += """ + : [c_ptr] "+r" (c_ptr), [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [k] "+r" (k) + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31" + ); + return 0; + } + """ + + if data_type == 'int8': + cc_code = cc_code.replace('unsigned char', 'char') + cc_code = cc_code.replace('umull', 'smull') + cc_code = cc_code.replace('uadalp', 'sadalp') + + from tvm.contrib import util, clang + temp = util.tempdir() + ll_path = temp.relpath("temp.ll") + # Create LLVM ir from c source code + cc_code = cc_code + ll_code = clang.create_llvm(cc_code, options=["--target=aarch64-linux-gnu -mattr=+neon"], output=ll_path) + return ll_code + + +def gemv_quantized(M, N, K, in_type, out_type): + """ + Use integer ARM v8 instructions in order to produce a block c of 4x4 elements + given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final + result is c = a*b (where '*' indicates the matrix product) + + Every row of the matrix c is obtained (for uint8) by a sequence of + + umull -> uadalp -> umull2 -> uadalp + + The block size is constrained by the number of registers available in arvm8. This + function returns a TensorIntrin that can be used to tensorize + a schedule. 
+ + Parameters + ---------- + M: int + rows of the matrix A + N: int + columns of the matrix B + K: int + columns of matrix A + in_type: str, {'uint8', 'int8'} + out_type: str, {'uint32', 'int32'} + + Returns + ------- + intrin : TensorIntrin + The ARM uint8/int8 TensorIntrin that can be used in tensorizing schedule + """ + A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name='A') + B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name='B') + + idxm = tvm.tir.indexmod + + k = te.reduce_axis((0, K), "k") + + C = te.compute((te.var("m"), te.var("n")), + lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype(out_type) * + B[k // 16, y, idxm(k, 16)].astype(out_type), + axis=k), name="C") + + a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", + offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) + + b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", + offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) + + c_buffer = tvm.tir.decl_buffer(C.shape, dtype=out_type, name="c_buffer", + offset_factor=1, strides=[te.var('sc'), 1]) + + def _intrin_func(ins, outs): + + def _instr(): + ib = tvm.tir.ir_builder.create() + aa, bb = ins + cc = outs[0] + stepA = min(4, M) + stepB = min(4, N) + + if in_type == 'int8': + ib.emit(tvm.tir.call_extern("int32", "gemv_int8_int8_int32_{0}_{1}".format(stepA, stepB), + outs[0].access_ptr("w"), + a_buffer.access_ptr("r"), + b_buffer.access_ptr("r"), + K)) + else: + ib.emit(tvm.tir.call_extern("int32", "gemv_uint8_uint8_int32_{0}_{1}".format(stepA, stepB), + c_buffer.access_ptr("w"), + a_buffer.access_ptr("r"), + b_buffer.access_ptr("r"), + K, + C.shape[0], # m, very useful for debug + C.shape[1])) # n, very useful for debug + return ib.get() + + # body, reset, update + return _instr() + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin(C.op, _intrin_func, binds={A:a_buffer, B:b_buffer, C:c_buffer}, default_buffer_params=buffer_params) + + def dot_int8_int8_int32(int32_lanes, dtype='uint'): """ Int8 dot product by every 4 elements using ARM v8.2 udot. diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 767087b0d4f0..7645588f2d35 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -187,6 +187,25 @@ def schedule_conv2d_winograd_weight_transform(outs): return s +def schedule_conv2d_gemm_weight_transform(outs): + """Schedule for weight transformation of gemm + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of this operator + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + # Typically this is computed in PreCompute pass + s = te.create_schedule([x.op for x in outs]) + return s + + def schedule_conv3d_winograd_weight_transform(outs): """Schedule for weight transformation of 3D winograd diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 4c7941b49692..d72c0621202a 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -590,6 +590,31 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") +def conv2d_gemm_weight_transform(kernel): + KH, KW, IC, OC = get_const_tuple(kernel.shape) + K = KH * KW * IC + N = OC + + kernel_flat = te.compute((K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], 'weight_flatten') + + pad_k = 0 + pad_n = 0 + + if N % 4 != 0: + pad_n = 4 - (N % 4) + + if K % 16 != 0: + pad_k = 16 - (K % 16) + + N_padded = N + pad_n + K_padded = K + pad_k + + if pad_k != 0 or pad_n != 0: + kernel_flat = pad(kernel_flat, pad_before=(0, 0), pad_after=(pad_k, pad_n), name='weight_padding') + + return te.compute((N_padded // 4, K_padded // 16, 4, 16), lambda x, y, z, w: kernel_flat[w + 16 * y, z + 4 * x], name='weight_block_reshape') + + def conv2d_winograd_weight_transform(kernel, tile_size): """Weight transformation for winograd From ddc136c2002acb21c5d77b0a5a9b10422977809d Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 9 Jun 2020 15:57:30 +0100 Subject: [PATCH 02/15] Add ASF header to conv2d_gemm.py Change-Id: I33853279e39c849ae1b555a9c91d7557985a0a35 --- topi/python/topi/arm_cpu/conv2d_gemm.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/topi/python/topi/arm_cpu/conv2d_gemm.py b/topi/python/topi/arm_cpu/conv2d_gemm.py index fddea47af450..50a8992bec4f 100644 --- a/topi/python/topi/arm_cpu/conv2d_gemm.py +++ b/topi/python/topi/arm_cpu/conv2d_gemm.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ import tvm import topi from tvm import te From ead3a40e50469381159342aa74a5f1383cf3d8d0 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 9 Jun 2020 16:26:49 +0100 Subject: [PATCH 03/15] Run clang-format-10 on c++ files Change-Id: Ieee22f032e595dabfc1616ab33466fcbf8d94365 --- src/relay/op/nn/convolution.cc | 27 +++++++++++---------------- src/relay/op/nn/convolution.h | 12 ++++++------ 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 55ae2d7f1575..32889a07c0e5 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -78,11 +78,10 @@ Expr MakeConvWinograd(Expr data, Expr weight, int tile_size, Array st } template -Expr MakeConvGemm(Expr data, Expr weight, Array strides, - Array padding, Array dilation, int groups, - IndexExpr channels, Array kernel_size, std::string data_layout, - std::string kernel_layout, std::string out_layout, DataType out_dtype, - std::string op_name) { +Expr MakeConvGemm(Expr data, Expr weight, Array strides, Array padding, + Array dilation, int groups, IndexExpr channels, + Array kernel_size, std::string data_layout, std::string kernel_layout, + std::string out_layout, DataType out_dtype, std::string op_name) { auto attrs = make_object(); attrs->strides = std::move(strides); attrs->padding = std::move(padding); @@ -532,14 +531,13 @@ weight transformation in advance. // relay.nn.contrib_conv2d_gemm_without_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_without_weight_transform") - .set_body_typed([](Expr data, Expr weight, Array strides, - Array padding, Array dilation, int groups, - IndexExpr channels, Array kernel_size, std::string data_layout, + .set_body_typed([](Expr data, Expr weight, Array strides, Array padding, + Array dilation, int groups, IndexExpr channels, + Array kernel_size, std::string data_layout, std::string kernel_layout, std::string out_layout, DataType out_dtype) { return MakeConvGemm( - data, weight, strides, padding, dilation, groups, channels, kernel_size, - data_layout, kernel_layout, out_layout, out_dtype, - "nn.contrib_conv2d_gemm_without_weight_transform"); + data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype, "nn.contrib_conv2d_gemm_without_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform") @@ -560,15 +558,13 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) .add_type_rel("Conv2DGemm", Conv2DGemmRel) - .set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); // relay.nn.contrib_conv2d_gemm_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_weight_transform") .set_body_typed([](Expr weights) { - return MakeConvGemmWeightTransform(weights, - "nn.contrib_conv2d_gemm_weight_transform"); + return MakeConvGemmWeightTransform(weights, "nn.contrib_conv2d_gemm_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_weight_transform") @@ -679,6 +675,5 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.deformable_conv2d") kernel_size, data_layout, kernel_layout, out_layout, out_dtype, "nn.deformable_conv2d"); }); - } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index f2dba77fcc00..f7c330fbd002 100644 --- 
a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -392,7 +392,7 @@ inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_input CHECK_EQ(data->shape.size(), 4) << "Only support HWIO kernel layout"; - const auto K = data->shape[0]*data->shape[1]*data->shape[2]; + const auto K = data->shape[0] * data->shape[1] * data->shape[2]; const auto N = data->shape[3]; auto k_mod_16 = indexmod(K, 16); @@ -405,10 +405,10 @@ inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_input const auto K_padded = K + pad_k; Array oshape{ - indexdiv(N_padded, 4), - indexdiv(K_padded, 16), - 4, - 16, + indexdiv(N_padded, 4), + indexdiv(K_padded, 16), + 4, + 16, }; reporter->Assign(types[1], TensorType(oshape, data->dtype)); @@ -553,7 +553,7 @@ bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& at template bool Conv2DGemmRel(const Array& types, int num_inputs, const Attrs& attrs, - const TypeReporter& reporter) { + const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); if (data == nullptr) return false; From 1a7a50cfa1743ac567a83e5b7763dfc20b0621ed Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 9 Jun 2020 16:36:20 +0100 Subject: [PATCH 04/15] Fix pylint errors/warnings Change-Id: I435d4d7bca7500db99547f4401fdc0d0995a1ff4 --- python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/strategy/arm_cpu.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index b854f4ac5e28..53f317dcccfe 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -461,7 +461,7 @@ def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype): return [out] reg.register_schedule("nn.contrib_conv2d_gemm_weight_transform", - strategy.schedule_conv2d_gemm_weight_transform) + strategy.schedule_conv2d_gemm_weight_transform) reg.register_pattern("nn.contrib_conv2d_gemm_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 8d35ee3f23ce..05a629d526e3 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -272,12 +272,8 @@ def _compute_conv2d_gemm(attrs, inputs, out_type): @conv2d_gemm_without_weight_transform_strategy.register("arm_cpu") def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom arm cpu strategy""" - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") layout = attrs.data_layout - strides = attrs.get_int_tuple("strides") data = inputs[0] - kernel = inputs[1] strategy = _op.OpStrategy() if layout == "NHWC" and data.dtype in ['int8', 'uint8']: @@ -286,8 +282,9 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), name="conv2d_NHWC_quantized_without_transform.arm_cpu") else: - raise RuntimeError("Unsupported conv2d_gemm_without_weight_transform layout {0} with datatype {1}". - format(layout, data.dtype)) + raise RuntimeError( + "Unsupported conv2d_gemm_without_weight_transform layout {0} with datatype {1}". 
+ format(layout, data.dtype)) return strategy @conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"]) From 6576359519c6240e3299caf224ba5743d41709a0 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 9 Jun 2020 18:34:22 +0100 Subject: [PATCH 05/15] Fix pylint errors/warnings in topi Change-Id: I2fc1ad8453e9020072ab967c849df5390c2967b5 --- topi/python/topi/arm_cpu/conv2d_alter_op.py | 10 ++-- topi/python/topi/arm_cpu/conv2d_gemm.py | 46 ++++++++++------ topi/python/topi/arm_cpu/conv2d_int8.py | 13 +++-- topi/python/topi/arm_cpu/tensor_intrin.py | 60 ++++++++++++--------- topi/python/topi/nn/conv2d.py | 22 ++++++-- 5 files changed, 102 insertions(+), 49 deletions(-) diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 348e43b1dfc7..b58d6a953c36 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -258,10 +258,14 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1]) new_kernel = te.placeholder((N_padded // 4, K_padded // 16, 4, 16), kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), CO], "conv2d_NHWC_int8_without_tranform.arm_cpu") + new_workload = autotvm.task.args_to_workload([data, new_kernel, + strides, padding, dilation, + out_dtype, (KH, KW), CO], + "conv2d_NHWC_int8_without_tranform.arm_cpu") dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], kernel_expr, **new_attrs) + return relay.nn.contrib_conv2d_gemm_without_weight_transform(inputs[0], + kernel_expr, + **new_attrs) return None diff --git a/topi/python/topi/arm_cpu/conv2d_gemm.py b/topi/python/topi/arm_cpu/conv2d_gemm.py index 50a8992bec4f..124fa3bd2cfe 100644 --- a/topi/python/topi/arm_cpu/conv2d_gemm.py +++ b/topi/python/topi/arm_cpu/conv2d_gemm.py @@ -14,9 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +# pylint: disable=invalid-name, unused-variable, too-many-locals +# pylint: disable=unused-argument, redefined-builtin +"""GEMM Convolution schedule on ARM""" import tvm -import topi from tvm import te from topi import nn from ..util import get_const_tuple @@ -25,8 +26,11 @@ # Compute function -def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels): - +def compute_conv2d_gemm_without_weight_transform(cfg, + data, B_interleaved_t, strides, padding, dilation, + out_dtype, kernel_size, output_channels): + """Compute conv2d by transforming the input, + executing GEMM and transforming the output back""" batches, IH, IW, IC = get_const_tuple(data.shape) KH, KW = kernel_size @@ -49,7 +53,8 @@ def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, str OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 if pad_top or pad_left: - data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad") + data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], + name="data_pad") else: data_pad = data @@ -61,10 +66,13 @@ def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, str A_shape = (batches, M, K) if K_AREA == 1: A = te.compute(A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], - name='data_flatten') + name='data_flatten') else: - A = te.compute(A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW) + dilation_h * (y // IC) // KW, WSTR * (x % OW) + dilation_w * (y // IC) % KW, y % IC], - name='data_im2col') + A = te.compute(A_shape, lambda n, x, y: + data_pad[n, + HSTR * (x // OW) + dilation_h * (y // IC) // KW, + WSTR * (x % OW) + dilation_w * (y // IC) % KW, y % IC], + name='data_im2col') N_transformed = B_interleaved_t.shape[0] # --- Pad if necessary @@ -91,19 +99,27 @@ def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, str # --- GEMM: A*B' k = te.reduce_axis((0, K_padded), "k") - A_interleaved = te.compute((batches, M_padded // 4, K_padded // 16, 4, 16), lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], name='A_interleaved') + A_interleaved = te.compute((batches, M_padded // 4, K_padded // 16, 4, 16), + lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], + name='A_interleaved') C_interleaved = te.compute((batches, M_padded // 4, N_transformed, 4, 4), - lambda b, x, y, w, z: te.sum(A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) * - B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), axis=k), - name='C_interleaved') + lambda b, x, y, w, z: + te.sum(A_interleaved[b, x, k//16, w, idxm(k, 16)].astype(out_dtype)* + B_interleaved_t[y, k//16, z, idxm(k, 16)].astype(out_dtype), + axis=k), + name='C_interleaved') # --- Unpack C - C = te.compute((batches, M, N), lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], name="C", tag='injective') + C = te.compute((batches, M, N), + lambda b, x, y: + C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], + name="C", tag='injective') # --- Produce the conv output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name='conv2d_gemm_output') + out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), + name='conv2d_gemm_output') return out @@ -111,7 +127,7 @@ def compute_conv2d_gemm_without_weight_transform(cfg, data, B_interleaved_t, str def 
schedule_conv2d_gemm(cfg, s, out): - + """Create schedule for tensors""" C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] A_interleaved = C_interleaved.op.input_tensors[0] diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index e5ae40f80cd6..3a1401658c99 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -23,7 +23,6 @@ from ..generic import conv2d as conv2d_generic from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload -from ..nn.util import get_const_int, get_pad_tuple from .tensor_intrin import dot_int8_int8_int32 from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm @@ -118,12 +117,18 @@ def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) kernel = nn.conv2d_gemm_weight_transform(kernel) - return compute_conv2d_gemm_without_weight_transform(cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC) + return compute_conv2d_gemm_without_weight_transform(cfg, + data, kernel, strides, padding, + dilation, out_dtype, (KH, KW), OC) @autotvm.register_topi_compute("compute_conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None): - return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels) +def compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size=None, + output_channels=None): + return compute_conv2d_gemm_without_weight_transform(cfg, data, B, strides, padding, + dilation, out_dtype, kernel_size, + output_channels) @autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") diff --git a/topi/python/topi/arm_cpu/tensor_intrin.py b/topi/python/topi/arm_cpu/tensor_intrin.py index d8285fa6e878..cf56a06c326a 100644 --- a/topi/python/topi/arm_cpu/tensor_intrin.py +++ b/topi/python/topi/arm_cpu/tensor_intrin.py @@ -19,15 +19,23 @@ import tvm from tvm import te +from tvm.contrib import util, clang +def gemv_quantized_impl(M, N, data_type='uint8'): + """ Assembly implementation of a blocked gemv. 
Given + a block a of shape (4, k) and a block b' of shape (4, k) + produces the output block c = a*b of shape (4,4) """ -def gemv_quantized_impl(M, N, data_type = 'uint8'): stepA = min(4, M) stepB = min(4, N) - assert data_type in ['uint8', 'int8'], 'Only uint8 and int8 are supported for this implementation' + assert data_type in ['uint8', 'int8'], 'Only uint8/int8 supported for this implementation' cc_code = """ - extern "C" int gemv_{0}_{0}_int32_{1}_{2}(int *c_buffer, unsigned char *a_buffer, unsigned char *b_buffer, int K, int m, int n) """.format(data_type, stepA, stepB) + extern "C" int gemv_{0}_{0}_int32_{1}_{2}(int *c_buffer, + unsigned char *a_buffer, + unsigned char *b_buffer, + int K, int m, int n) + """.format(data_type, stepA, stepB) cc_code += """ { @@ -255,24 +263,24 @@ def gemv_quantized_impl(M, N, data_type = 'uint8'): cc_code = cc_code.replace('umull', 'smull') cc_code = cc_code.replace('uadalp', 'sadalp') - from tvm.contrib import util, clang temp = util.tempdir() ll_path = temp.relpath("temp.ll") # Create LLVM ir from c source code - cc_code = cc_code - ll_code = clang.create_llvm(cc_code, options=["--target=aarch64-linux-gnu -mattr=+neon"], output=ll_path) + ll_code = clang.create_llvm(cc_code, + options=["--target=aarch64-linux-gnu -mattr=+neon"], + output=ll_path) return ll_code def gemv_quantized(M, N, K, in_type, out_type): """ Use integer ARM v8 instructions in order to produce a block c of 4x4 elements - given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final + given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final result is c = a*b (where '*' indicates the matrix product) Every row of the matrix c is obtained (for uint8) by a sequence of - umull -> uadalp -> umull2 -> uadalp + umull -> uadalp -> umull2 -> uadalp The block size is constrained by the number of registers available in arvm8. 
This function returns a TensorIntrin that can be used to tensorize @@ -303,8 +311,8 @@ def gemv_quantized(M, N, K, in_type, out_type): C = te.compute((te.var("m"), te.var("n")), lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype(out_type) * - B[k // 16, y, idxm(k, 16)].astype(out_type), - axis=k), name="C") + B[k // 16, y, idxm(k, 16)].astype(out_type), + axis=k), name="C") a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) @@ -313,7 +321,7 @@ def gemv_quantized(M, N, K, in_type, out_type): offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) c_buffer = tvm.tir.decl_buffer(C.shape, dtype=out_type, name="c_buffer", - offset_factor=1, strides=[te.var('sc'), 1]) + offset_factor=1, strides=[te.var('sc'), 1]) def _intrin_func(ins, outs): @@ -325,26 +333,30 @@ def _instr(): stepB = min(4, N) if in_type == 'int8': - ib.emit(tvm.tir.call_extern("int32", "gemv_int8_int8_int32_{0}_{1}".format(stepA, stepB), - outs[0].access_ptr("w"), - a_buffer.access_ptr("r"), - b_buffer.access_ptr("r"), - K)) + ib.emit(tvm.tir.call_extern("int32", + "gemv_int8_int8_int32_{0}_{1}".format(stepA, stepB), + outs[0].access_ptr("w"), + a_buffer.access_ptr("r"), + b_buffer.access_ptr("r"), + K)) else: - ib.emit(tvm.tir.call_extern("int32", "gemv_uint8_uint8_int32_{0}_{1}".format(stepA, stepB), - c_buffer.access_ptr("w"), - a_buffer.access_ptr("r"), - b_buffer.access_ptr("r"), - K, - C.shape[0], # m, very useful for debug - C.shape[1])) # n, very useful for debug + ib.emit(tvm.tir.call_extern("int32", + "gemv_uint8_uint8_int32_{0}_{1}".format(stepA, stepB), + c_buffer.access_ptr("w"), + a_buffer.access_ptr("r"), + b_buffer.access_ptr("r"), + K, + C.shape[0], # m, very useful for debug + C.shape[1])) # n, very useful for debug return ib.get() # body, reset, update return _instr() buffer_params = {"offset_factor": 1} - return te.decl_tensor_intrin(C.op, _intrin_func, binds={A:a_buffer, B:b_buffer, C:c_buffer}, default_buffer_params=buffer_params) + return te.decl_tensor_intrin(C.op, _intrin_func, + binds={A:a_buffer, B:b_buffer, C:c_buffer}, + default_buffer_params=buffer_params) def dot_int8_int8_int32(int32_lanes, dtype='uint'): diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index d72c0621202a..c2b4eaa10cce 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -591,11 +591,25 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou def conv2d_gemm_weight_transform(kernel): + """Weight transformation for winograd + + Parameters + ---------- + kernel: Tensor + The raw kernel tensor with layout "NHWC". 
+ + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [CI*KH*KW,CO] + """ KH, KW, IC, OC = get_const_tuple(kernel.shape) K = KH * KW * IC N = OC - kernel_flat = te.compute((K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], 'weight_flatten') + kernel_flat = te.compute((K, N), lambda x, y: + kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], + 'weight_flatten') pad_k = 0 pad_n = 0 @@ -610,9 +624,11 @@ def conv2d_gemm_weight_transform(kernel): K_padded = K + pad_k if pad_k != 0 or pad_n != 0: - kernel_flat = pad(kernel_flat, pad_before=(0, 0), pad_after=(pad_k, pad_n), name='weight_padding') + kernel_flat = pad(kernel_flat, pad_before=(0, 0), pad_after=(pad_k, pad_n), + name='weight_padding') - return te.compute((N_padded // 4, K_padded // 16, 4, 16), lambda x, y, z, w: kernel_flat[w + 16 * y, z + 4 * x], name='weight_block_reshape') + return te.compute((N_padded // 4, K_padded // 16, 4, 16), lambda x, y, z, w: + kernel_flat[w + 16 * y, z + 4 * x], name='weight_block_reshape') def conv2d_winograd_weight_transform(kernel, tile_size): From 637516a77551026be0f4a93c8556a95e673e5785 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 10 Jun 2020 09:54:36 +0100 Subject: [PATCH 06/15] Fix legalizations tests for aarch64 Change-Id: I0a67a49a7849f52ef7d57b9292ce9125bbb7cb2c --- tests/python/relay/test_pass_qnn_legalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index 5f7deff23b06..d39a23b68353 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -148,7 +148,7 @@ def _get_mod(data_dtype, kernel_dtype): # Older ARM vesions. with tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu'): legalized_mod = relay.qnn.transform.Legalize()(mod) - assert 'cast' in legalized_mod.astext() and "qnn" not in legalized_mod.astext() + assert 'cast' not in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() # Check uint8 x int8 transformation mod = _get_mod('uint8', 'int8') @@ -176,7 +176,7 @@ def _get_mod(data_dtype, kernel_dtype): # Older ARM vesions. with tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu'): legalized_mod = relay.qnn.transform.Legalize()(mod) - assert 'cast' in legalized_mod.astext() and "qnn" not in legalized_mod.astext() + assert 'cast' in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() ########################################### # Check transformations for CUDA platforms. 
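For readers who want to check the data flow that compute_conv2d_gemm_without_weight_transform builds in the patches above (im2col, 4x16 block interleaving of both operands, blocked GEMM, unpack), the following NumPy sketch mirrors the TE stages named in the diff (data_im2col, weight_flatten, A_interleaved, B_interleaved_t, C_interleaved, C). It is an illustration only: the concrete shapes, stride 1, single batch and the absence of padding and dilation are assumptions of this sketch, not constraints of the patch.

    import numpy as np

    kh, kw, ic, oc = 3, 3, 3, 7          # small shapes so that every padding path triggers
    ih = iw = 7
    rng = np.random.default_rng(0)
    data = rng.integers(0, 255, (ih, iw, ic), dtype=np.uint8)     # NHWC, batch dropped
    kernel = rng.integers(0, 255, (kh, kw, ic, oc), dtype=np.uint8)  # HWIO

    oh, ow = ih - kh + 1, iw - kw + 1
    m, k, n = oh * ow, kh * kw * ic, oc

    # im2col ('data_im2col'): one row of A per output pixel, flattened in (kh, kw, ic) order.
    a = np.stack([data[y:y + kh, x:x + kw, :].reshape(-1)
                  for y in range(oh) for x in range(ow)])

    # 'weight_flatten': the HWIO kernel viewed as a (K, N) matrix.
    b = kernel.reshape(k, n)

    # Pad M and N up to multiples of 4 and K up to a multiple of 16, then block-interleave,
    # mirroring 'A_interleaved' and 'B_interleaved_t'.
    mp, kp, np_ = m + (-m % 4), k + (-k % 16), n + (-n % 4)
    a_p = np.pad(a, ((0, mp - m), (0, kp - k)))
    b_p = np.pad(b, ((0, kp - k), (0, np_ - n)))
    a_i = a_p.reshape(mp // 4, 4, kp // 16, 16).transpose(0, 2, 1, 3)    # (mp//4, kp//16, 4, 16)
    b_it = b_p.reshape(kp // 16, 16, np_ // 4, 4).transpose(2, 0, 3, 1)  # (np//4, kp//16, 4, 16)

    # Blocked GEMM ('C_interleaved'): each (4, 16) tile of A meets a (4, 16) tile of B',
    # accumulating a 4x4 int32 block, the unit the gemv intrinsic produces in one shot.
    c_i = np.einsum('akwl,bkzl->abwz', a_i.astype(np.int32), b_it.astype(np.int32))

    # Unpack ('C') and drop the padding; reshaping gives the NHWC output ('conv2d_gemm_output').
    c = c_i.transpose(0, 2, 1, 3).reshape(mp, np_)[:m, :n]
    out = c.reshape(oh, ow, oc)

    assert np.array_equal(c, a.astype(np.int32) @ b.astype(np.int32))

The final assert checks that the blocked computation is just A*B' stored in a different layout, which is exactly the property the gemv_quantized tensor intrinsic exploits one 4x4 output block at a time.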
From 1d1073f890eebff5fb96ca86aef5935fa0e91f24 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 11 Jun 2020 15:01:49 +0100 Subject: [PATCH 07/15] Reintroduce conv2d_nhwc_spatial_pack.arm_cpu and int16 cast Change-Id: I91b67fabd475e90a9b75f2dd5ecfee851265e0bb --- python/tvm/relay/op/strategy/arm_cpu.py | 11 +++++----- python/tvm/relay/qnn/op/legalizations.py | 4 ++-- tests/python/relay/test_pass_qnn_legalize.py | 4 ++-- topi/python/topi/arm_cpu/conv2d_alter_op.py | 22 ++++++++++++-------- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 05a629d526e3..25066c7610be 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -113,16 +113,17 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name='conv2d_direct_simd.micro_dev') elif kernel_layout == "HWIO": is_aarch64 = "aarch64" in str(isa.target) + if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), name="compute_conv2d_NHWC_quantized.arm_cpu") - else: - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu") + + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError("Unsupported kernel layout {} for conv2d NHWC". format(kernel_layout)) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 4eac33333eef..a32ac014c6ff 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -249,9 +249,9 @@ def is_aarch64_arm(): @qnn_conv2d_legalize.register('arm_cpu') def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): # ARM prefers the dtypes to be same. - if is_aarch64_arm(): + if is_aarch64_arm() and attrs["data_layout"] == "NHWC" or is_fast_int8_on_arm(): return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) - return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.qnn.op.conv2d) + return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) @qnn_dense_legalize.register('arm_cpu') diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index d39a23b68353..5f7deff23b06 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -148,7 +148,7 @@ def _get_mod(data_dtype, kernel_dtype): # Older ARM vesions. with tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu'): legalized_mod = relay.qnn.transform.Legalize()(mod) - assert 'cast' not in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() + assert 'cast' in legalized_mod.astext() and "qnn" not in legalized_mod.astext() # Check uint8 x int8 transformation mod = _get_mod('uint8', 'int8') @@ -176,7 +176,7 @@ def _get_mod(data_dtype, kernel_dtype): # Older ARM vesions. 
with tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu'): legalized_mod = relay.qnn.transform.Legalize()(mod) - assert 'cast' in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() + assert 'cast' in legalized_mod.astext() and "qnn" not in legalized_mod.astext() ########################################### # Check transformations for CUDA platforms. diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index b58d6a953c36..5defc09dc995 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -88,21 +88,25 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.conv2d(*inputs, **new_attrs) if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu": - assert data_layout == "NHWC" and kernel_layout == "HWIO" - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, _, CO = get_const_tuple(kernel.shape) - VC = cfg['tile_co'].size[-1] + assert (data.dtype == 'int8' and kernel.dtype == 'int8' or + data.dtype == 'uint8' and kernel.dtype == 'uint8') - new_attrs['kernel_layout'] = 'OHWI%do' % VC + data_expr, kernel_expr = inputs + + data_int16 = relay.cast(data_expr, dtype='int16') + kernel_int16 = relay.cast(kernel_expr, dtype='int16') + + new_attrs = {k : attrs[k] for k in attrs.keys()} + + new_data = te.placeholder(data.shape, 'int16') + new_kernel = te.placeholder(kernel.shape, 'int16') - new_data = data - new_kernel = te.placeholder((idxd(CO, VC), KH, KW, CI, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], - "conv2d_nhwc_spatial_pack.arm_cpu") + 'conv2d_nhwc_spatial_pack.arm_cpu') dispatch_ctx.update(target, new_workload, cfg) - return relay.nn.conv2d(*inputs, **new_attrs) + return relay.nn.conv2d(data_int16, kernel_int16, **new_attrs) if topi_tmpl == "conv2d_nchw_winograd.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" From fc0efc5861f213b9c4d940c8982e8e8c22fef3ee Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 11 Jun 2020 17:12:18 +0100 Subject: [PATCH 08/15] Switch type of legalization depending on the strategy used Change-Id: I9a03040a8c40a6cd2658ed14c3751e05a8e19f2b --- python/tvm/relay/qnn/op/legalizations.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index a32ac014c6ff..eee881d86fb4 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -21,6 +21,7 @@ import tvm from tvm import relay from .. import op as reg +from tvm import autotvm ################################################# # Register the functions for different operators. @@ -248,9 +249,25 @@ def is_aarch64_arm(): @qnn_conv2d_legalize.register('arm_cpu') def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): - # ARM prefers the dtypes to be same. 
- if is_aarch64_arm() and attrs["data_layout"] == "NHWC" or is_fast_int8_on_arm(): + # This legalization depends on the strategy used + + # Get the strategy from the tuner + target = tvm.target.Target.current(allow_none=False) + input_ph = tvm.te.placeholder(types[0].shape, types[0].dtype) + kernel_ph = tvm.te.placeholder(types[1].shape, types[1].dtype) + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, [input_ph, kernel_ph], types[-1], target) + + workload = autotvm.task.get_workload(outs) + + assert workload != None, "Unable to retrieve a workload" + + topi_tmpl = workload[0] + + if topi_tmpl == 'compute_conv2d_NHWC_quantized.arm_cpu' or is_fast_int8_on_arm(): + # Fast implementations prefer the types to be the same return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) + return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) From 6e926dbd9cbd416ff14291db4a7e41a0742d9049 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 11 Jun 2020 17:31:00 +0100 Subject: [PATCH 09/15] Revert last commit Change-Id: Ice34101e358e3ce8ebfb12c58f73e910ba5de8e8 --- python/tvm/relay/qnn/op/legalizations.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index eee881d86fb4..a32ac014c6ff 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -21,7 +21,6 @@ import tvm from tvm import relay from .. import op as reg -from tvm import autotvm ################################################# # Register the functions for different operators. @@ -249,25 +248,9 @@ def is_aarch64_arm(): @qnn_conv2d_legalize.register('arm_cpu') def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): - # This legalization depends on the strategy used - - # Get the strategy from the tuner - target = tvm.target.Target.current(allow_none=False) - input_ph = tvm.te.placeholder(types[0].shape, types[0].dtype) - kernel_ph = tvm.te.placeholder(types[1].shape, types[1].dtype) - _, outs = relay.backend.compile_engine.select_implementation( - relay.op.get("nn.conv2d"), attrs, [input_ph, kernel_ph], types[-1], target) - - workload = autotvm.task.get_workload(outs) - - assert workload != None, "Unable to retrieve a workload" - - topi_tmpl = workload[0] - - if topi_tmpl == 'compute_conv2d_NHWC_quantized.arm_cpu' or is_fast_int8_on_arm(): - # Fast implementations prefer the types to be the same + # ARM prefers the dtypes to be same. 
+ if is_aarch64_arm() and attrs["data_layout"] == "NHWC" or is_fast_int8_on_arm(): return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) - return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) From 38d59fa8c67154551646c754bd12f270c38ef7e8 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 11 Jun 2020 18:49:13 +0100 Subject: [PATCH 10/15] Fix the auto-tuner by registering the correct schedules Change-Id: Id9273688b2620e1ea849ab01b4c46af8fbf37fd0 --- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- topi/python/topi/arm_cpu/conv2d_alter_op.py | 2 +- topi/python/topi/arm_cpu/conv2d_int8.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 25066c7610be..d682aad63bec 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -118,7 +118,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="compute_conv2d_NHWC_quantized.arm_cpu") + name="conv2d_NHWC_quantized.arm_cpu") strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 5defc09dc995..c23879a0c664 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -239,7 +239,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - if topi_tmpl == "compute_conv2d_NHWC_quantized.arm_cpu": + if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": assert (data.dtype == 'int8' and kernel.dtype == 'int8' or data.dtype == 'uint8' and kernel.dtype == 'uint8') CO, IC, KH, KW = get_const_tuple(kernel.shape) diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index 3a1401658c99..ae74553c72fd 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -112,7 +112,7 @@ def traverse(op): return s -@autotvm.register_topi_compute("compute_conv2d_NHWC_quantized.arm_cpu") +@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) @@ -122,7 +122,7 @@ def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, dilation, out_dtype, (KH, KW), OC) -@autotvm.register_topi_compute("compute_conv2d_NHWC_quantized_without_transform.arm_cpu") +@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") def compute_conv2d_NHWC_quantized_without_transform(cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None): From e5618a7e474afb0957fa634c48a1baefbc5996e3 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 15 Jun 2020 11:29:59 +0100 Subject: [PATCH 11/15] Address review comments Change-Id: Ia1755a0af7b6d159072d9f0c93c932c481101e48 --- python/tvm/relay/op/nn/_nn.py | 1 - topi/python/topi/arm_cpu/conv2d_alter_op.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git 
a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 53f317dcccfe..a4381cb9ae60 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -452,7 +452,6 @@ def compute_mirror_pad(attrs, inputs, out_dtype): reg.register_pattern("nn.contrib_conv2d_gemm_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) - @reg.register_compute("nn.contrib_conv2d_gemm_weight_transform") def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_gemm_weight_transform""" diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index c23879a0c664..085a84d819ad 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -91,6 +91,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert (data.dtype == 'int8' and kernel.dtype == 'int8' or data.dtype == 'uint8' and kernel.dtype == 'uint8') + assert data_layout == "NHWC" and kernel_layout == "HWIO" + data_expr, kernel_expr = inputs data_int16 = relay.cast(data_expr, dtype='int16') @@ -242,6 +244,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": assert (data.dtype == 'int8' and kernel.dtype == 'int8' or data.dtype == 'uint8' and kernel.dtype == 'uint8') + + assert data_layout == "NHWC" and kernel_layout == "HWIO" + CO, IC, KH, KW = get_const_tuple(kernel.shape) K = KH * KW * IC From 9057c8b0d5cd792083b6e0a9818f21ba0cd7bd4f Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 19 Jun 2020 11:50:15 +0100 Subject: [PATCH 12/15] Improve usability and readability of conv2d_gemm_weight_transform Change-Id: I3333186bbc2fe4054b58ce15d910e3be7b315482 --- include/tvm/relay/attrs/nn.h | 14 ++++++ python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/nn/nn.py | 8 +++- python/tvm/relay/qnn/op/legalizations.py | 4 +- src/relay/op/nn/convolution.cc | 15 +++++-- src/relay/op/nn/convolution.h | 47 ++++++++++++++++----- topi/python/topi/arm_cpu/conv2d_alter_op.py | 34 +++++++-------- topi/python/topi/arm_cpu/conv2d_gemm.py | 2 - topi/python/topi/arm_cpu/conv2d_int8.py | 4 +- topi/python/topi/nn/conv2d.py | 34 +++++++++------ 10 files changed, 112 insertions(+), 52 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index abe63e583ddc..15cd118a5fac 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -187,6 +187,20 @@ struct ConvWinogradWeightTransformAttrs : public tvm::AttrsNode { + int tile_rows; + int tile_cols; + + TVM_DECLARE_ATTRS(ConvGemmWeightTransformAttrs, + "relay.attrs.ConvGemmWeightTransformAttrs") { + TVM_ATTR_FIELD(tile_rows).describe( + "Tile rows of the weight transformation for ConvGemm."); + TVM_ATTR_FIELD(tile_cols).describe( + "Tile columns of the weight transformation for ConvGemm."); + } +}; + /*! 
\brief Attributes used in convolution operators with winograd algorithm */ struct Conv2DWinogradAttrs : public tvm::AttrsNode { int tile_size; diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index a4381cb9ae60..564d6f762b3f 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -456,7 +456,7 @@ def compute_mirror_pad(attrs, inputs, out_dtype): def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_gemm_weight_transform""" out = topi.nn.conv2d_gemm_weight_transform( - inputs[0]) + inputs[0], attrs.tile_rows, attrs.tile_cols) return [out] reg.register_schedule("nn.contrib_conv2d_gemm_weight_transform", diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index d6538ea6b428..3c47cf7919b5 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -2272,7 +2272,7 @@ def contrib_conv2d_winograd_weight_transform(weight, return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size) -def contrib_conv2d_gemm_weight_transform(weights): +def contrib_conv2d_gemm_weight_transform(weights, tile_rows, tile_cols): r"""Weight Transformation part for 2D convolution with gemm algorithm. We separate this as a single op to enable pre-compute for inference. @@ -2282,13 +2282,17 @@ def contrib_conv2d_gemm_weight_transform(weights): ---------- weights : tvm.relay.Expr The weight expressions. + tile_rows: int + Tile rows of the weight transformation for ConvGemm. + tile_cols: int + Tile columns of the weight transformation for ConvGemm. Returns ------- result : tvm.relay.Expr The computed result. """ - return _make.contrib_conv2d_gemm_weight_transform(weights) + return _make.contrib_conv2d_gemm_weight_transform(weights, tile_rows, tile_cols) def contrib_conv3d_winograd_weight_transform(weight, diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index a32ac014c6ff..72462141258c 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -238,7 +238,7 @@ def is_fast_int8_on_arm(): return '+v8.2a,+dotprod' in ' '.join(target.options) def is_aarch64_arm(): - """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + """ Checks whether we are compiling for an AArch64 target. """ target = tvm.target.Target.current(allow_none=False) return 'aarch64' in ' '.join(target.options) @@ -249,7 +249,7 @@ def is_aarch64_arm(): @qnn_conv2d_legalize.register('arm_cpu') def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): # ARM prefers the dtypes to be same. 
- if is_aarch64_arm() and attrs["data_layout"] == "NHWC" or is_fast_int8_on_arm(): + if (is_aarch64_arm() and attrs["data_layout"] == "NHWC") or is_fast_int8_on_arm(): return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 32889a07c0e5..f63c48915f25 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -104,9 +104,12 @@ Expr MakeConvWinogradWeightTransform(Expr weight, int tile_size, std::string op_ return Call(op, {weight}, Attrs(attrs), {}); } -Expr MakeConvGemmWeightTransform(Expr weight, std::string op_name) { +Expr MakeConvGemmWeightTransform(Expr weight, int tile_rows, int tile_cols, std::string op_name) { + auto attrs = make_object(); + attrs->tile_rows = tile_rows; + attrs->tile_cols = tile_cols; const Op& op = Op::Get(op_name); - return Call(op, {weight}); + return Call(op, {weight}, Attrs(attrs), {}); } template @@ -562,9 +565,12 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform") // relay.nn.contrib_conv2d_gemm_weight_transform +TVM_REGISTER_NODE_TYPE(ConvGemmWeightTransformAttrs); + TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_weight_transform") - .set_body_typed([](Expr weights) { - return MakeConvGemmWeightTransform(weights, "nn.contrib_conv2d_gemm_weight_transform"); + .set_body_typed([](Expr weights, int tile_rows, int tile_cols) { + return MakeConvGemmWeightTransform(weights, tile_rows, tile_cols, + "nn.contrib_conv2d_gemm_weight_transform"); }); RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_weight_transform") @@ -574,6 +580,7 @@ Separate this into another operator in order to enable Precompute Pass to comput weight transformation in advance. )code" TVM_ADD_FILELINE) + .set_attrs_type() .set_num_inputs(1) .add_argument("weights", "Tensor", "The weights tensor.") .set_support_level(10) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index f7c330fbd002..2dd16dac48c0 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -384,31 +384,58 @@ inline bool Conv2DWinogradWeightTransformRel(const Array& types, int num_i } // Gemm convolution shape relations +// In order to run GEMM we need to block-transpose and interleave the K x N weights matrix W. +// The high level idea is to subdivide W in tiles of tile_cols x tile_rows, and transpose and +// interleave them. The final output is a [N//tile_rows, K//tile_cols, tile_rows, tile_cols] +// matrix that we call W_interleaved_t. +// +// In the following picture, we show how the first [tile_cols,tile_rows] block of W is transformed +// for tile_rows = 4 and tile_cols = 16 +// +// W[0,0,:,:] W_interleaved_t[0,0,:,:] +// +-------------------------------+ +----------------------------------- + +// |W[0,0] W[0,1] W[0,2] W[0,3] | |W[0,0] W[1,0] W[2,0] ... W[15,0]| +// |W[1,0] W[1,1] W[1,2] W[1,3] | --\ |W[0,1] W[1,1] W[2,1] ... W[15,1]| +// |W[2,0] W[2,1] W[2,2] W[2,3] | --/ |W[0,2] W[1,2] W[2,2] ... W[15,2]| +// | ... ... ... ... | |W[0,3] W[1,3] W[2,3] ... W[15,3]| +// | ... ... ... ... | +------------------------------------+ +// |W[15,0] W[15,1] W[15,2] W[15,3]| +// +-------------------------------+ +// +// Tile columns is usually the direction of the reduction. So, if our target can reduce k elements +// at the time, we should set tile_cols = k. +// Tile rows is connected with the number of registers available for the given target. 
+// inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); const auto* data = types[0].as(); if (data == nullptr) return false; + const ConvGemmWeightTransformAttrs* param = attrs.as(); + CHECK(param != nullptr); + int n = param->tile_rows; + int k = param->tile_cols; + CHECK_EQ(data->shape.size(), 4) << "Only support HWIO kernel layout"; const auto K = data->shape[0] * data->shape[1] * data->shape[2]; const auto N = data->shape[3]; - auto k_mod_16 = indexmod(K, 16); - auto n_mod_4 = indexmod(N, 4); + auto K_mod_k = indexmod(K, k); + auto N_mod_n = indexmod(N, n); - auto pad_k = tvm::if_then_else(k_mod_16 != 0, 16 - k_mod_16, tir::make_zero(DataType::Int(32))); - auto pad_n = tvm::if_then_else(n_mod_4 != 0, 4 - n_mod_4, tir::make_zero(DataType::Int(32))); + auto pad_K = tvm::if_then_else(K_mod_k != 0, k - K_mod_k, tir::make_zero(DataType::Int(32))); + auto pad_N = tvm::if_then_else(N_mod_n != 0, n - N_mod_n, tir::make_zero(DataType::Int(32))); - const auto N_padded = N + pad_n; - const auto K_padded = K + pad_k; + const auto N_padded = N + pad_N; + const auto K_padded = K + pad_K; Array oshape{ - indexdiv(N_padded, 4), - indexdiv(K_padded, 16), - 4, - 16, + indexdiv(N_padded, n), + indexdiv(K_padded, k), + n, + k, }; reporter->Assign(types[1], TensorType(oshape, data->dtype)); diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 085a84d819ad..99fdf21d5bc0 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -244,28 +244,28 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": assert (data.dtype == 'int8' and kernel.dtype == 'int8' or data.dtype == 'uint8' and kernel.dtype == 'uint8') - assert data_layout == "NHWC" and kernel_layout == "HWIO" - CO, IC, KH, KW = get_const_tuple(kernel.shape) - K = KH * KW * IC N = CO - pad_k = 0 - pad_n = 0 - - if N % 4 != 0: - pad_n = 4 - (N % 4) - - if K % 16 != 0: - pad_k = 16 - (K % 16) - - N_padded = N + pad_n - K_padded = K + pad_k - - kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1]) - new_kernel = te.placeholder((N_padded // 4, K_padded // 16, 4, 16), kernel.dtype) + tile_rows = 4 + tile_cols = 16 + pad_K = 0 + pad_N = 0 + + if N % tile_rows != 0: + pad_N = tile_rows - (N % tile_rows) + if K % tile_cols != 0: + pad_k = tile_cols - (K % tile_cols) + + N_padded = N + pad_N + K_padded = K + pad_K + kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_rows, tile_cols) + new_kernel = te.placeholder((N_padded // tile_rows, + K_padded // tile_cols, + tile_rows, + tile_cols), kernel.dtype) new_workload = autotvm.task.args_to_workload([data, new_kernel, strides, padding, dilation, diff --git a/topi/python/topi/arm_cpu/conv2d_gemm.py b/topi/python/topi/arm_cpu/conv2d_gemm.py index 124fa3bd2cfe..2b6122919d85 100644 --- a/topi/python/topi/arm_cpu/conv2d_gemm.py +++ b/topi/python/topi/arm_cpu/conv2d_gemm.py @@ -124,8 +124,6 @@ def compute_conv2d_gemm_without_weight_transform(cfg, return out # Schedules - - def schedule_conv2d_gemm(cfg, s, out): """Create schedule for tensors""" C = out.op.input_tensors[0] diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index ae74553c72fd..5a895c084c06 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -116,7 +116,9 @@ def 
traverse(op): def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - kernel = nn.conv2d_gemm_weight_transform(kernel) + tile_rows = 4 + tile_cols = 16 + kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols) return compute_conv2d_gemm_without_weight_transform(cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index c2b4eaa10cce..59288892ebaa 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -590,13 +590,17 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") -def conv2d_gemm_weight_transform(kernel): +def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols): """Weight transformation for winograd Parameters ---------- kernel: Tensor The raw kernel tensor with layout "NHWC". + tile_rows: int + Tile rows of the weight transformation for ConvGemm. + tile_cols: int + Tile columns of the weight transformation for ConvGemm. Returns ------- @@ -611,24 +615,28 @@ def conv2d_gemm_weight_transform(kernel): kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], 'weight_flatten') - pad_k = 0 - pad_n = 0 + pad_K = 0 + pad_N = 0 - if N % 4 != 0: - pad_n = 4 - (N % 4) + if N % tile_rows != 0: + pad_N = tile_rows - (N % tile_rows) - if K % 16 != 0: - pad_k = 16 - (K % 16) + if K % tile_cols != 0: + pad_k = tile_cols - (K % tile_cols) - N_padded = N + pad_n - K_padded = K + pad_k + N_padded = N + pad_N + K_padded = K + pad_K - if pad_k != 0 or pad_n != 0: - kernel_flat = pad(kernel_flat, pad_before=(0, 0), pad_after=(pad_k, pad_n), + if pad_K != 0 or pad_N != 0: + kernel_flat = pad(kernel_flat, pad_before=(0, 0), pad_after=(pad_K, pad_N), name='weight_padding') - return te.compute((N_padded // 4, K_padded // 16, 4, 16), lambda x, y, z, w: - kernel_flat[w + 16 * y, z + 4 * x], name='weight_block_reshape') + return te.compute((N_padded // tile_rows, + K_padded // tile_cols, + tile_rows, + tile_cols), lambda x, y, z, w: + kernel_flat[w + tile_cols * y, z + tile_rows * x], + name='weight_block_reshape') def conv2d_winograd_weight_transform(kernel, tile_size): From f4303027dfe354497b0171de3671e2af58f68085 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 19 Jun 2020 12:11:58 +0100 Subject: [PATCH 13/15] Change variable name to weight in Conv2DGemmWeightTransformRel Change-Id: Ifb5f1f33af7512fe67c6b049b20a42a0bb2d26c9 --- src/relay/op/nn/convolution.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 2dd16dac48c0..f53f4e0454a4 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -409,18 +409,18 @@ inline bool Conv2DWinogradWeightTransformRel(const Array& types, int num_i inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 2); - const auto* data = types[0].as(); - if (data == nullptr) return false; + const auto* weight = types[0].as(); + if (weight == nullptr) return false; const ConvGemmWeightTransformAttrs* param = attrs.as(); CHECK(param != nullptr); int n = param->tile_rows; int k = param->tile_cols; - CHECK_EQ(data->shape.size(), 4) << "Only support HWIO kernel layout"; + CHECK_EQ(weight->shape.size(), 4) << "Only support HWIO 
kernel layout"; - const auto K = data->shape[0] * data->shape[1] * data->shape[2]; - const auto N = data->shape[3]; + const auto K = weight->shape[0] * weight->shape[1] * weight->shape[2]; + const auto N = weight->shape[3]; auto K_mod_k = indexmod(K, k); auto N_mod_n = indexmod(N, n); @@ -438,7 +438,7 @@ inline bool Conv2DGemmWeightTransformRel(const Array& types, int num_input k, }; - reporter->Assign(types[1], TensorType(oshape, data->dtype)); + reporter->Assign(types[1], TensorType(oshape, weight->dtype)); return true; } From 5a21a292701b9255cf30d87a369d56fabe3f7dca Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 19 Jun 2020 12:21:47 +0100 Subject: [PATCH 14/15] Fix clang-10 linting errors Change-Id: I25ccc844d9cee23766096e1daddb6180abc413a6 --- include/tvm/relay/attrs/nn.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 15cd118a5fac..5f1ee2f31cc5 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -192,12 +192,9 @@ struct ConvGemmWeightTransformAttrs : public tvm::AttrsNode Date: Mon, 22 Jun 2020 17:57:16 +0100 Subject: [PATCH 15/15] Trigger tests Change-Id: Id37706fb7cf77a87a3cc817ecf8046297d9ca95a
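The tile_rows/tile_cols parametrisation introduced in the last patches can be summarised with a short NumPy reference of the weight transform. This is a sketch of the layout described in the comment block above Conv2DGemmWeightTransformRel, not the TOPI implementation itself; with the defaults tile_rows=4 and tile_cols=16 it produces the same W_interleaved_t layout that the earlier hard-coded transform produced and that the GEMM sketch after patch 06 consumes as B_interleaved_t.

    import numpy as np

    def gemm_weight_transform_ref(kernel, tile_rows=4, tile_cols=16):
        """NumPy reference of the block-transpose/interleave of an HWIO kernel.

        Returns an array of shape
        (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols),
        i.e. the W_interleaved_t layout sketched in convolution.h.
        """
        kh, kw, ic, oc = kernel.shape
        k, n = kh * kw * ic, oc                    # K = KH*KW*IC, N = OC
        flat = kernel.reshape(k, n)                # 'weight_flatten'

        pad_k = -k % tile_cols                     # pad K up to a tile_cols multiple
        pad_n = -n % tile_rows                     # pad N up to a tile_rows multiple
        flat = np.pad(flat, ((0, pad_k), (0, pad_n)))   # 'weight_padding'

        kp, np_ = k + pad_k, n + pad_n
        # 'weight_block_reshape': out[x, y, z, w] = flat[w + tile_cols*y, z + tile_rows*x]
        return (flat.reshape(kp // tile_cols, tile_cols, np_ // tile_rows, tile_rows)
                    .transpose(2, 0, 3, 1))

As the comment block notes, tile_cols follows the reduction width of the target (how many elements one multiply-accumulate sequence can reduce) and tile_rows follows the register blocking, so other targets can choose different tiles without touching the GEMM compute.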