From 6d7a450789faa9f5c5fda13311c49e11ee11a6d4 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 14:29:46 +0800 Subject: [PATCH 01/26] Add Matmul Op --- include/tvm/relay/attrs/nn.h | 18 ++- python/tvm/relay/op/nn/_nn.py | 45 ++++++- python/tvm/relay/op/nn/nn.py | 41 ++++++- python/tvm/relay/op/op_attrs.py | 6 +- python/tvm/relay/op/strategy/cuda.py | 20 +++ python/tvm/relay/op/strategy/generic.py | 36 ++++++ python/tvm/topi/cuda/dense.py | 44 ++++++- python/tvm/topi/generic/nn.py | 17 +++ python/tvm/topi/nn/dense.py | 115 ++++++++++++++++-- rust/tvm/src/ir/relay/attrs/nn.rs | 8 +- src/relay/op/make_op.h | 3 + src/relay/op/nn/nn.cc | 61 ++++++++-- src/relay/op/nn/nn.h | 25 ++-- src/relay/qnn/op/dense.cc | 16 +-- src/relay/quantize/realize.cc | 6 +- .../auto_scheduler_layout_rewrite.cc | 8 +- .../transforms/combine_parallel_dense.cc | 16 +-- tests/python/relay/test_op_level1.py | 63 +++++++++- tests/python/topi/python/test_topi_matmul.py | 26 ++++ 19 files changed, 501 insertions(+), 73 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index a58bb8750c14..de8fa26d66d9 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -961,19 +961,29 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for dense operator */ -struct DenseAttrs : public tvm::AttrsNode { +/*! \brief Attributes for matmul operator and dense operator */ +struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; - tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite DataType out_dtype; + bool input_transposed; + bool weight_transposed; + tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite - TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") { + TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") { TVM_ATTR_FIELD(units).describe("Number of hidden units of the dense transformation."); // use 0 bits to indicate none. TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); + + TVM_ATTR_FIELD(input_transposed) + .set_default(false) + .describe("Whether the input tensor is in transposed format."); + + TVM_ATTR_FIELD(weight_transposed) + .set_default(false) + .describe("Whether the weight tensor is in transposed format."); } }; diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index c6c4f4bfb959..dd1ee6a43857 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -52,6 +52,32 @@ reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) +@reg.register_legalize("nn.matmul") +def legalize_matmul(attrs, inputs, types): + """Legalize matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current matmul + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + return topi.nn.matmul_legalize(attrs, inputs, types) + + +# matmul +reg.register_strategy("nn.matmul", strategy.matmul_strategy) +reg.register_pattern("nn.matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + @reg.register_legalize("nn.dense") def legalize_dense(attrs, inputs, types): """Legalize dense op. 
@@ -1149,21 +1175,34 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _dense_shape_func(data_shape, weight_shape): +def _matmul_shape_func(data_shape, weight_shape, input_transposed, weight_transposed): out = output_tensor((data_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): out[i] = data_shape[i] - out[out.shape[0] - 1] = weight_shape[0] + if input_transposed: + out[out.shape[0] - 2] = out[out.shape[0] - 1] + out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] return out +@reg.register_shape_func("nn.matmul", False) +def matmul_shape_func(attrs, inputs, _): + """ + Shape function for matmul op. + """ + ret = [ + _matmul_shape_func(inputs[0], inputs[1], attrs.input_transposed, attrs.weight_transposed) + ] + return ret + + @reg.register_shape_func("nn.dense", False) def dense_shape_func(attrs, inputs, _): """ Shape function for dense op. """ - ret = [_dense_shape_func(inputs[0], inputs[1])] + ret = [_matmul_shape_func(inputs[0], inputs[1], False, True)] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index caf1f187fad3..179ff97afe68 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,9 +1471,48 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) +def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weight_transposed=False): + """Dense operator. + Applies a linear transformation. The X & W can be transposed. + + .. math:: + + `Y = X * W` + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator, + of shape `(d_1, d_2, ..., d_n, units_in)`. + + weight : tvm.relay.Expr + The weight expressions, 2-D matrix, + of shape `(units_in, units)` or `(units, units_in)`. + + units : int, optional + Number of hidden units of the dense transformation. + + out_dtype : str, optional + Specifies the output data type for mixed precision dense, + of shape `(d_1, d_2, ..., d_n, units)`. + + weight_transposed : bool, optional + Whether the weight tensor is in transposed format. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # With N/T format, the compute will be seen as a dense + if not input_transposed and weight_transposed: + return dense(data, weight, units, out_dtype) + return _make.matmul(data, weight, units, out_dtype, input_transposed, weight_transposed) + + def dense(data, weight, units=None, out_dtype=""): """Dense operator. - Applies a linear transformation + Applies a linear transformation. This is an alias of matmul with weight transposed. .. 
math:: diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 6844d133a77e..323190e309b8 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -64,9 +64,9 @@ class BiasAddAttrs(Attrs): """Atttribute of nn.bias_add""" -@tvm._ffi.register_object("relay.attrs.DenseAttrs") -class DenseAttrs(Attrs): - """Attributes for nn.dense""" +@tvm._ffi.register_object("relay.attrs.MatmulAttrs") +class MatmulAttrs(Attrs): + """Attributes for nn.matmul and nn.dense""" @tvm._ffi.register_object("relay.attrs.SoftmaxAttrs") diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index b4db412700a7..ef0c355ffe47 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -698,6 +698,26 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): return strategy +@matmul_strategy.register(["cuda", "gpu"]) +def matmul_strategy_cuda(attrs, inputs, out_type, target): + """matmul cuda strategy""" + strategy = _op.OpStrategy() + if target.kind.name == "cuda" and "cublas" in target.libs: + strategy.add_implementation( + wrap_compute_matmul(topi.cuda.matmul_cublas), + wrap_topi_schedule(topi.cuda.schedule_matmul_cublas), + name="matmul_cublas.cuda", + plevel=25, + ) + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + return strategy + + @dense_strategy.register(["cuda", "gpu"]) def dense_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d56820e409aa..9a552d73bf46 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -712,6 +712,36 @@ def dilation2d_strategy(attrs, inputs, out_type, target): return strategy +# matmul +def wrap_compute_matmul(topi_compute, need_auto_scheduler_layout=False): + """wrap matmul topi compute""" + + def _compute_matmul(attrs, inputs, out_type): + """Compute definition of matmul""" + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + args = [ + inputs[0], + inputs[1], + None, + out_dtype, + attrs.input_transposed, + attrs.weight_transposed, + ] + if need_auto_scheduler_layout: + args.append(get_auto_scheduler_rewritten_layout(attrs)) + return [topi_compute(*args)] + + return _compute_matmul + + +@override_native_generic_func("matmul_strategy") +def matmul_strategy(attrs, inputs, out_type, target): + """matmul generic strategy""" + logger.warning("matmul is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.generic.schedule_matmul), + name="matmul.generic", + ) + return strategy + + # dense def wrap_compute_dense(topi_compute, need_auto_scheduler_layout=False): """wrap dense topi compute""" diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 8adc38b84b1b..71b2464736df 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -30,10 +30,16 @@ logger = logging.getLogger("topi") -@autotvm.register_topi_compute("dense_cublas.cuda") -def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator on CUDA with CUBLAS""" - assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim dense" +def _matmul_cublas_common( + cfg, + data, + weight, + bias=None, 
out_dtype=None, + input_transposed=False, + weight_transposed=False, +): + assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: @@ -41,7 +47,7 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): assert out_dtype == data.dtype, "Mixed precision not supported." batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, False, True) + matmul = cublas.matmul(data, weight, input_transposed, weight_transposed) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -51,6 +57,34 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): return matmul +@autotvm.register_topi_compute("matmul_cublas.cuda") +def matmul_cublas( + cfg, + data, + weight, + bias=None, + out_dtype=None, + input_transposed=False, + weight_transposed=False, +): + """Matmul operator on CUDA with CUBLAS""" + return _matmul_cublas_common( + cfg, data, weight, bias, out_dtype, input_transposed, weight_transposed + ) + + +@autotvm.register_topi_schedule("matmul_cublas.cuda") +def schedule_matmul_cublas(_, outs): + """Schedule matmul operator using CUBLAS""" + return generic.schedule_extern(outs) + + +@autotvm.register_topi_compute("dense_cublas.cuda") +def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA with CUBLAS""" + return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True) + + @autotvm.register_topi_schedule("dense_cublas.cuda") def schedule_dense_cublas(_, outs): """Schedule dense operator using CUBLAS""" diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 04d649037fef..1b3214154687 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -580,6 +580,23 @@ def schedule_fast_softmax(outs): return _default_schedule(outs, False) +def schedule_matmul(outs): + """Schedule for matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of matmul + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_dense(outs): """Schedule for dense diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index e8ec476b86a5..daaf3c343dcf 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -21,7 +21,15 @@ from .. import tag -def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): +def matmul( + data, + weight, + bias=None, + out_dtype=None, + input_transposed=False, + weight_transposed=False, + auto_scheduler_rewritten_layout="", +): """The default implementation of dense in topi. 
Parameters @@ -51,37 +59,120 @@ def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layo assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - batch, in_dim = data.shape + if input_transposed: + in_dim, batch = data.shape + else: + batch, in_dim = data.shape if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] + auto_scheduler_rewritten_layout, ["j", "k"] if weight_transposed else ["k", "j"] ) auto_scheduler.remove_index_check(weight) - else: + elif weight_transposed: out_dim, red_dim = weight.shape + else: + red_dim, out_dim = weight.shape assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - matmul = te.compute( + if input_transposed: + if weight_transposed: + compute_lambda = lambda i, j: te.sum( + data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_matmul_TT" + else: + compute_lambda = lambda i, j: te.sum( + data[k, i].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul_TN" + compute_tag = "matmul" + else: + if weight_transposed: + compute_lambda = lambda i, j: te.sum( + data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_dense" + compute_tag = "dense" + else: + compute_lambda = lambda i, j: te.sum( + data[i, k].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + + mat = te.compute( (batch, out_dim), - lambda i, j: te.sum(data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k), - name="T_dense", - tag="dense", + compute_lambda, + name=compute_name, + tag=compute_tag, attrs={"layout_free_placeholders": [weight]}, ) + if bias is not None: - matmul = te.compute( + mat = te.compute( (batch, out_dim), - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), + lambda i, j: mat[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST, ) if auto_scheduler_rewritten_layout: - matmul = auto_scheduler.rewrite_compute_body(matmul, auto_scheduler_rewritten_layout) + mat = auto_scheduler.rewrite_compute_body(mat, auto_scheduler_rewritten_layout) + + return mat + + +@tvm.target.generic_func +def matmul_legalize(attrs, inputs, types): + """Legalizes matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current dense + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types - return matmul + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # not to change by default + # pylint: disable=unused-argument + return None + + +def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): + """The default implementation of dense in topi. + + Parameters + ---------- + data : tvm.te.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.te.Tensor + 2-D with shape [out_dim, in_dim] + + bias : Optional[tvm.te.Tensor] + 1-D with shape [out_dim] + + out_dtype : Optional[str] + The output type. This is used for mixed precision. + + auto_scheduler_rewritten_layout: str = "" + The layout after auto-scheduler's layout rewrite pass. 
+ + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [batch, out_dim] + """ + return matmul(data, weight, bias, out_dtype, False, True, auto_scheduler_rewritten_layout) @tvm.target.generic_func diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index f0137fa3cbcc..84533e9ecaea 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -56,12 +56,14 @@ pub struct BiasAddAttrsNode { #[repr(C)] #[derive(Object, Debug)] -#[ref_name = "DenseAttrs"] -#[type_key = "relay.attrs.DenseAttrs"] -pub struct DenseAttrsNode { +#[ref_name = "MatmulAttrs"] +#[type_key = "relay.attrs.MatmulAttrs"] +pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, + pub input_transposed: bool, + pub weight_transposed: bool, } #[repr(C)] diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index 81de4bc90ad7..b957df987f4c 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -44,6 +44,9 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr MakeConcatenate(Expr data, int axis); +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, + bool weight_transposed); + Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); Expr MakeBatchMatmul(Expr lhs, Expr rhs, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 32c0a21d46c7..70e7cd0f5be8 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -162,16 +162,47 @@ Useful for .set_support_level(3) .add_type_rel("FIFOBuffer", FIFOBufferRel); -// relay.nn.dense -TVM_REGISTER_NODE_TYPE(DenseAttrs); +// ------------------- relay.nn.matmul +TVM_REGISTER_NODE_TYPE(MatmulAttrs); -// Positional relay function to create dense operator used by frontend FFI. -Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, + bool weight_transposed) { + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - static const Op& op = Op::Get("nn.dense"); - return Call(op, {data, weight}, Attrs(attrs), {}); + attrs->input_transposed = input_transposed; + attrs->weight_transposed = weight_transposed; + if (!input_transposed && weight_transposed) { + static const Op& dense_op = Op::Get("nn.dense"); + return Call(dense_op, {data, weight}, Attrs(attrs), {}); + } else { + static const Op& matmul_op = Op::Get("nn.matmul"); + return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + } +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); + +RELAY_REGISTER_OP("nn.matmul") + .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. + +- **data**: `(x1, x2, ..., xn, input_dim)` +- **weight**: `(input_dim, units)` or `(units, input_dim)` +- **out**: `(x1, x2, ..., xn, units)`. + +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "nD Tensor", "Input data.") + .add_argument("weight", "2D Tensor", "Weight matrix.") + .set_support_level(1) + .add_type_rel("Matmul", MatmulRel); +// ------------------- relay.nn.matmul + +// ------------------- relay.nn.dense +// Positional relay function to create dense operator used by frontend FFI. 
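+// Calling MakeMatmul with data non-transposed and weight transposed dispatches back to the nn.dense op above, so the existing dense semantics (Y = X * W^T) are preserved.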
+Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { + return MakeMatmul(data, weight, units, out_dtype, false, true); } TVM_REGISTER_GLOBAL("relay.op.nn._make.dense").set_body_typed(MakeDense); @@ -184,19 +215,22 @@ RELAY_REGISTER_OP("nn.dense") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) - .add_type_rel("Dense", DenseRel); + .add_type_rel("Dense", MatmulRel); +// ------------------- relay.nn.dense -// relay.nn.contrib_dense_pack +// ------------------- relay.nn.contrib_dense_pack // Positional relay function to create dense_pack operator used by frontend FFI. Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -211,12 +245,13 @@ RELAY_REGISTER_OP("nn.contrib_dense_pack") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "3D Tensor", "Packed weight matrix.") .set_support_level(10) - .add_type_rel("DensePack", DensePackRel); + .add_type_rel("DensePack", DensePackRel); +// ------------------- relay.nn.contrib_dense_pack // relay.leaky_relu TVM_REGISTER_NODE_TYPE(LeakyReluAttrs); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 1ac800f357b0..dd394396a309 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -36,8 +37,8 @@ namespace tvm { namespace relay { template -bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, - const TypeReporter& reporter) { +bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); @@ -48,12 +49,19 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(static_cast(data->shape.size()) != 0); - Array dshape = data->shape; + const Array& dshape = data->shape; Array oshape = dshape; + tvm::PrimExpr reduce = dshape[dshape.size() - 1]; + if (param->input_transposed) { + reduce = dshape[dshape.size() - 2]; + oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); + } if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - Array wshape({param->units, dshape[dshape.size() - 1]}); + const Array& wshape = param->weight_transposed + ? Array({param->units, reduce}) + : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use // data dtype as the weight dtype. However if weight dtype is explicitly // present we will use that. @@ -70,7 +78,7 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, oshape.Set((oshape.size() - 1), param->units); } else { if (weight == nullptr) return false; - Array wshape = weight->shape; + const Array& wshape = weight->shape; // When weight's layout has been rewritten, figure it out based on the // total number of elements and input dimensions. 
if (param->auto_scheduler_rewritten_layout.size() != 0) { @@ -83,11 +91,12 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, } else { ICHECK(static_cast(weight->shape.size()) == 2); if (!data->shape.back().as()) { - ICHECK(reporter->AssertEQ(data->shape[data->shape.size() - 1], weight->shape[1])) - << "DenseRel: input dimension doesn't match," + ICHECK((param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || + (!param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + << "MatmulRel: input dimension doesn't match," << " data shape=" << data->shape << ", weight shape=" << weight->shape; } - oshape.Set((oshape.size() - 1), wshape[0]); + oshape.Set((oshape.size() - 1), param->weight_transposed ? wshape[0] : wshape[1]); } } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 6284524bff27..fbaa8a96f562 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -45,8 +45,8 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr || weight == nullptr) return false; - const auto* param = attrs.as(); - ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr."; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "MatmulAttrs cannot be nullptr."; ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype; ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) @@ -70,22 +70,24 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. Array tensor_types = {types[0], types[1], types[6]}; - return DenseRel(tensor_types, 3, attrs, reporter); + return MatmulRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point, Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, Attrs(attrs), {}); } Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel, - const DenseAttrs* attrs) { + const MatmulAttrs* attrs) { return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype); } @@ -161,7 +163,7 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array& new_args, const auto in_shape = get_shape(arg_types[0]); const int reduction_dim_size = get_const_int(in_shape[1]); - const auto* qnn_dense_attrs = attrs.as(); + const auto* qnn_dense_attrs = attrs.as(); auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); @@ -204,7 +206,7 @@ RELAY_REGISTER_OP("qnn.dense") - **weight**: quantized(int8, unit8) `(units, input_dim)` - **out**: quantized(int32) `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(6) .add_argument("data", "quantized nD Tensor", "Input data.") .add_argument("weight", "quantized 2D Tensor", "Weight matrix.") diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 968628fbfe39..0f7a5eafb0f0 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -280,11 +280,13 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec } Expr rdata = Cast(rhs->data, cfg->dtype_weight); - const auto ref_attrs = ref_call->attrs.as(); - auto attrs = make_object(); + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index edc4119ce859..ac110544a2b5 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -87,7 +87,7 @@ class FuncMutator : public ExprMutator { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); - } else if (auto pattr = call->attrs.as()) { + } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); @@ -105,7 +105,7 @@ class FuncMutator : public ExprMutator { std::vector target_ops_{"nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.dense", "nn.batch_matmul"}; + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { @@ -166,8 +166,8 @@ TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout") return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; - } else if (attrs->IsInstance()) { - return attrs.as()->auto_scheduler_rewritten_layout; + } else if (attrs->IsInstance()) { + return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; } else { diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 3cd9cca4fec4..41f3a70c62bc 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -70,15 +70,15 @@ class ParallelDenseToBatchCombiner : public ParallelOpBatchCombiner { } CHECK_EQ(num_args, 2); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); return Downcast(MakeBatchMatmul(new_args[0], new_args[1], origin_attrs->out_dtype)); } virtual bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); ICHECK(attrs_a); ICHECK(attrs_b); const auto* weight_a = a->args[1]->type_as(); @@ -103,8 +103,8 @@ class ParallelDenseToDenseCombiner : 
public ParallelOpCombiner { bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); ICHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); @@ -119,11 +119,13 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { IndexExpr new_output_dims; // concat all weights into one std::tie(new_weight, new_output_dims) = TransformWeight(branches); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); - const auto dense_attrs = make_object(); + const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; + dense_attrs->input_transposed = false; + dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 89475ac7df86..11f79b2477bf 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -409,6 +409,66 @@ def test_batch_norm(): ) +@pytest.mark.xfail +def test_matmul_type_check(): + dtype = "float16" + n, c, h, w = 2, 2, 2, 2 + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + # it should fail since it does not match with m(2) + mismatch_w = 3 + w = relay.var("w", relay.TensorType((mismatch_w, 2), dtype)) + y = relay.nn.matmul(x, w) + yy = run_infer_type(y) + + +@tvm.testing.uses_gpu +def test_matmul(): + for dtype in ["float16", "float32"]: + # Matmul accuracy for float16 is poor + if dtype == "float16": + continue + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + w = relay.var("w", relay.TensorType((2, w), dtype)) + y = relay.nn.matmul(x, w, units=2, weight_transposed=True) + assert "units=2" in y.astext() + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) + + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) + wh, ww = te.size_var("wh"), te.size_var("ww") + w = relay.var("w", relay.TensorType((wh, ww), dtype)) + y = relay.nn.matmul(x, w, input_transposed=True) + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) + + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + w = relay.var("w", relay.IncompleteType()) + y = relay.nn.matmul(x, w, units=2) + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) + + x = relay.var("x", shape=(5, 10), dtype=dtype) + w = relay.var("w", shape=(5, 2), dtype=dtype) + z = relay.nn.matmul(x, w, input_transposed=True) + + # Check result. 
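+ # With input_transposed=True, x of shape (5, 10) is read as (in_dim, batch) and w of shape + # (5, 2) as (in_dim, units), so the reference result below is np.dot(x_data.T, w_data) of shape (10, 2).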
+ func = relay.Function([x, w], z) + x_data = np.random.rand(5, 10).astype(dtype) + w_data = np.random.rand(5, 2).astype(dtype) + ref_res = np.dot(x_data.transpose(), w_data) + + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) + op_res1 = intrp1.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + + @pytest.mark.xfail def test_dense_type_check(): dtype = "float16" @@ -426,7 +486,7 @@ def test_dense(): for dtype in ["float16", "float32"]: # Dense accuracy for float16 is poor if dtype == "float16": - return + continue n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.TensorType((2, w), dtype)) @@ -506,6 +566,7 @@ def test_bitserial_dense(): test_log_softmax() test_dropout() test_batch_norm() + test_matmul() test_dense() test_bitserial_dense() test_dense_dtype() diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index e5a21a3ad3b7..c777b8f2bc6e 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -41,6 +41,31 @@ def with_tvm(lam, *args): return out_nd.numpy() +def verify_nn_matmul(sa, sb, transp_a, transp_b): + a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32) + b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) + c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) + c2 = with_tvm( + lambda A, B: topi.nn.matmul(A, B, input_transposed=transp_a, weight_transposed=transp_b), + a, + b, + ) + tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5) + + +def test_nn_matmul(): + verify_nn_matmul((1, 1), (1, 1), False, False) + verify_nn_matmul((1, 1), (1, 1), True, True) + verify_nn_matmul((2, 2), (2, 2), False, False) + verify_nn_matmul((2, 2), (2, 2), True, True) + verify_nn_matmul((2, 3), (3, 5), False, False) + verify_nn_matmul((5, 3), (3, 2), False, False) + verify_nn_matmul((3, 5), (3, 2), True, False) + verify_nn_matmul((3, 5), (2, 3), True, True) + verify_nn_matmul((3, 5), (3, 2), True, False) + verify_nn_matmul((5, 3), (2, 3), False, True) + + def verify_matmul(sa, sb, transp_a, transp_b): a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32) b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) @@ -79,5 +104,6 @@ def test_tensordot(): if __name__ == "__main__": + test_nn_matmul() test_matmul() test_tensordot() From 3c24a86b578b85b4abe4b77e510778833344ad7f Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:00:44 +0800 Subject: [PATCH 02/26] Bug fix --- python/tvm/relay/frontend/tensorflow.py | 14 ++++++++++- python/tvm/relay/frontend/tensorflow_ops.py | 23 +++++++++++++++---- src/relay/op/nn/nn.h | 1 - .../frontend/tensorflow/test_forward.py | 12 ++++++++-- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 0bdec953a540..7ae94aa6c67c 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,6 +44,10 @@ __all__ = ["from_tensorflow"] +# By default, TVM convert `tf.matmul` to `nn.dense` op with data 
tensor non-transposed and weight +# tensor transposed +_USE_DENSE_INSTEAD_OF_MATMUL = True + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1204,7 +1208,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): return func, self._params -def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): +def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op=True): """Load tensorflow graph which is a python tensorflow graph object into relay. The companion parameters will be handled automatically. @@ -1222,6 +1226,11 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): outputs : List of output tensor names (Optional) if not specified then the last node is assumed as graph output. + use_dense_op : bool (Optional) + True to convert `tf.matmul` to `nn.dense`, otherwise convert to `nn.matmul`. + The `nn.dense` op requires the data tensor to be non-transposed and the weight tensor to be + transposed, so extra `transpose` ops may be inserted into the original graph. + Returns ------- mod : tvm.IRModule params : dict of str to tvm.nd.NDArray Dict of converted parameters stored in tvm.nd.NDArray format """ + global _USE_DENSE_INSTEAD_OF_MATMUL + if use_dense_op != _USE_DENSE_INSTEAD_OF_MATMUL: + _USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op g = GraphProto() mod, params = g.from_tensorflow(graph, layout, shape, outputs) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index c7385565857d..e89e624e123e 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1114,12 +1114,25 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): channels = _infer_channels(inputs[1], not attr["transpose_b"]) - if attr["transpose_a"]: - inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) - if not attr["transpose_b"]: - inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) + from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + if _USE_DENSE_INSTEAD_OF_MATMUL: + if attr["transpose_a"]: + inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) + if not attr["transpose_b"]: + inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) + return AttrCvt( + op_name="dense", + extras={"units": channels}, + ignores=["transpose_a", "transpose_b", "T"], + )(inputs, attr) return AttrCvt( - op_name="dense", extras={"units": channels}, ignores=["transpose_a", "transpose_b", "T"] + op_name="matmul", + extras={ + "units": channels, + "input_transposed": attr["transpose_a"] or False, + "weight_transposed": attr["transpose_b"] or False, + }, + ignores=["transpose_a", "transpose_b", "T"], )(inputs, attr) return _impl diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index dd394396a309..fe59ba047e18 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -27,7 +27,6 @@ #include #include #include -#include #include diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 331553388b48..afd38d1c952d 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -117,6 +117,7 @@ def run_tvm_graph( disabled_pass=None, ignore_in_shape=False, serialize=False, + use_dense_op=True, ): """Generic function to compile on relay and execute on tvm""" input_data = convert_to_list(input_data) @@ -131,7 +132,11 @@ def run_tvm_graph(
i.shape if hasattr(i, "shape") else () for e, i in zip(input_node, input_data) } mod, params = relay.frontend.from_tensorflow( - graph_def, layout=layout, shape=shape_dict, outputs=out_names + graph_def, + layout=layout, + shape=shape_dict, + outputs=out_names, + use_dense_op=use_dense_op, ) dev = tvm.device(target, 0) if mode == "debug": @@ -213,6 +218,7 @@ def compare_tf_with_tvm( add_shapes_to_graph_def=True, targets=None, ignore_in_shape=False, + use_dense_op=True, ): """Generic function to generate and compare tensorflow and TVM output""" @@ -260,6 +266,7 @@ def name_without_num(name): mode=mode, cuda_layout=cuda_layout, ignore_in_shape=ignore_in_shape, + use_dense_op=use_dense_op, ) # since the names from tensorflow and relay runs are not exactly same, # first len(tf_output) will be compared @@ -1795,7 +1802,8 @@ def _test_matmul(i, j, k, dtype, outer=None): A_np = np.random.uniform(high=5.0, size=A_shape).astype(dtype) B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) - compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name) + compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name, use_dense_op=True) + compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name, use_dense_op=False) def test_forward_matmul(): From 4a808d727e7f2ecae56945495a39a2e7fbb8a5de Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:18:13 +0800 Subject: [PATCH 03/26] Lint fix --- src/relay/transforms/auto_scheduler_layout_rewrite.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index ac110544a2b5..5005e7bf4c41 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -103,9 +103,9 @@ class FuncMutator : public ExprMutator { std::deque ori_layouts_queue_; std::deque new_layouts_queue_; - std::vector target_ops_{"nn.conv2d", "nn.conv3d", - "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.matmul", "nn.dense", "nn.batch_matmul"}; + std::vector target_ops_{ + "nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { From 2f3d4fae5e3976513334af209e6b5585926a1c0a Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:23:53 +0800 Subject: [PATCH 04/26] Update --- include/tvm/relay/attrs/nn.h | 4 ++-- python/tvm/relay/frontend/tensorflow_ops.py | 2 +- python/tvm/relay/op/nn/_nn.py | 6 +++--- python/tvm/relay/op/nn/nn.py | 11 +++++++---- python/tvm/relay/op/strategy/generic.py | 2 +- python/tvm/topi/cuda/dense.py | 8 ++++---- python/tvm/topi/nn/dense.py | 6 +++--- rust/tvm/src/ir/relay/attrs/nn.rs | 2 +- src/relay/op/make_op.h | 2 +- src/relay/op/nn/nn.cc | 8 ++++---- src/relay/op/nn/nn.h | 2 +- src/relay/qnn/op/dense.cc | 2 +- src/relay/quantize/realize.cc | 2 +- src/relay/transforms/combine_parallel_dense.cc | 2 +- tests/python/relay/test_op_level1.py | 4 ++-- tests/python/topi/python/test_topi_matmul.py | 2 +- 16 files changed, 34 insertions(+), 31 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index de8fa26d66d9..6518f122b72e 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -965,7 +965,7 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; - bool 
input_transposed; + bool data_transposed; bool weight_transposed; tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite @@ -977,7 +977,7 @@ struct MatmulAttrs : public tvm::AttrsNode { .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(input_transposed) + TVM_ATTR_FIELD(data_transposed) .set_default(false) .describe("Whether the input tensor is in transposed format."); diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index e89e624e123e..32c1792aa700 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1129,7 +1129,7 @@ def _impl(inputs, attr, params, mod): op_name="matmul", extras={ "units": channels, - "input_transposed": attr["transpose_a"] or False, + "data_transposed": attr["transpose_a"] or False, "weight_transposed": attr["transpose_b"] or False, }, ignores=["transpose_a", "transpose_b", "T"], diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index dd1ee6a43857..7bf593c0d9be 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1175,11 +1175,11 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _matmul_shape_func(data_shape, weight_shape, input_transposed, weight_transposed): +def _matmul_shape_func(data_shape, weight_shape, data_transposed, weight_transposed): out = output_tensor((data_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): out[i] = data_shape[i] - if input_transposed: + if data_transposed: out[out.shape[0] - 2] = out[out.shape[0] - 1] out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] @@ -1192,7 +1192,7 @@ def matmul_shape_func(attrs, inputs, _): Shape function for matmul op. """ ret = [ - _matmul_shape_func(inputs[0], inputs[1], attrs.input_transposed, attrs.weight_transposed) + _matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed) ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 179ff97afe68..df049478f577 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,7 +1471,7 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) -def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weight_transposed=False): +def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): """Dense operator. Applies a linear transformation. The X & W can be transposed. @@ -1483,7 +1483,7 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh ---------- data : tvm.relay.Expr The input data to the operator, - of shape `(d_1, d_2, ..., d_n, units_in)`. + of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`. weight : tvm.relay.Expr The weight expressions, 2-D matrix, @@ -1496,6 +1496,9 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. + data_transposed : bool, optional + Whether the input tensor is in transposed format. + weight_transposed : bool, optional Whether the weight tensor is in transposed format. @@ -1505,9 +1508,9 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh The computed result. 
""" # With N/T format, the compute will be seen as a dense - if not input_transposed and weight_transposed: + if not data_transposed and weight_transposed: return dense(data, weight, units, out_dtype) - return _make.matmul(data, weight, units, out_dtype, input_transposed, weight_transposed) + return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) def dense(data, weight, units=None, out_dtype=""): diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 9a552d73bf46..ed3bc4af8d3d 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -725,7 +725,7 @@ def _compute_matmul(attrs, inputs, out_type): inputs[1], None, out_dtype, - attrs.input_transposed, + attrs.data_transposed, attrs.weight_transposed, ] if need_auto_scheduler_layout: diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 71b2464736df..76fbd7f82117 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -36,7 +36,7 @@ def _matmul_cublas_common( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, ): assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" @@ -47,7 +47,7 @@ def _matmul_cublas_common( assert out_dtype == data.dtype, "Mixed precision not supported." batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, input_transposed, weight_transposed) + matmul = cublas.matmul(data, weight, data_transposed, weight_transposed) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -64,12 +64,12 @@ def matmul_cublas( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, ): """Matmul operator on CUDA with CUBLAS""" return _matmul_cublas_common( - cfg, data, weight, bias, out_dtype, input_transposed, weight_transposed + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed ) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index daaf3c343dcf..9cc0b1c19f84 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -26,7 +26,7 @@ def matmul( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, auto_scheduler_rewritten_layout="", ): @@ -59,7 +59,7 @@ def matmul( assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - if input_transposed: + if data_transposed: in_dim, batch = data.shape else: batch, in_dim = data.shape @@ -77,7 +77,7 @@ def matmul( assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - if input_transposed: + if data_transposed: if weight_transposed: compute_lambda = lambda i, j: te.sum( data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index 84533e9ecaea..e77972e45f86 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -62,7 +62,7 @@ pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, - pub input_transposed: bool, + pub data_transposed: bool, pub weight_transposed: bool, } diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index b957df987f4c..f7bb98f9d1b6 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ 
-44,7 +44,7 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr MakeConcatenate(Expr data, int axis); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed); Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 70e7cd0f5be8..7283242e1914 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -165,14 +165,14 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->input_transposed = input_transposed; + attrs->data_transposed = data_transposed; attrs->weight_transposed = weight_transposed; - if (!input_transposed && weight_transposed) { + if (!data_transposed && weight_transposed) { static const Op& dense_op = Op::Get("nn.dense"); return Call(dense_op, {data, weight}, Attrs(attrs), {}); } else { @@ -229,7 +229,7 @@ Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index fe59ba047e18..60bff5d3c69a 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -51,7 +51,7 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const Array& dshape = data->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (param->input_transposed) { + if (param->data_transposed) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index fbaa8a96f562..f47fe71ebab7 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -79,7 +79,7 @@ Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kern auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 0f7a5eafb0f0..09572b04ed4b 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -285,7 +285,7 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 41f3a70c62bc..966210c3c882 100644 --- 
a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -124,7 +124,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; - dense_attrs->input_transposed = false; + dense_attrs->data_transposed = false; dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 11f79b2477bf..d5a0e8bc8403 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -439,7 +439,7 @@ def test_matmul(): x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) wh, ww = te.size_var("wh"), te.size_var("ww") w = relay.var("w", relay.TensorType((wh, ww), dtype)) - y = relay.nn.matmul(x, w, input_transposed=True) + y = relay.nn.matmul(x, w, data_transposed=True) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) @@ -452,7 +452,7 @@ def test_matmul(): x = relay.var("x", shape=(5, 10), dtype=dtype) w = relay.var("w", shape=(5, 2), dtype=dtype) - z = relay.nn.matmul(x, w, input_transposed=True) + z = relay.nn.matmul(x, w, data_transposed=True) # Check result. func = relay.Function([x, w], z) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index c777b8f2bc6e..cf92ab0807cc 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -46,7 +46,7 @@ def verify_nn_matmul(sa, sb, transp_a, transp_b): b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) c2 = with_tvm( - lambda A, B: topi.nn.matmul(A, B, input_transposed=transp_a, weight_transposed=transp_b), + lambda A, B: topi.nn.matmul(A, B, data_transposed=transp_a, weight_transposed=transp_b), a, b, ) From 0fc436a9109032fec2f5a69f1b3b47c77b46a367 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:35:56 +0800 Subject: [PATCH 05/26] Update --- python/tvm/relay/op/nn/nn.py | 2 +- python/tvm/topi/nn/dense.py | 8 +++++++- src/relay/op/nn/nn.cc | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index df049478f577..ee82d47548ec 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1497,7 +1497,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight of shape `(d_1, d_2, ..., d_n, units)`. data_transposed : bool, optional - Whether the input tensor is in transposed format. + Whether the data tensor is in transposed format. weight_transposed : bool, optional Whether the weight tensor is in transposed format. diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 9cc0b1c19f84..b7e3ece5f579 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -30,7 +30,7 @@ def matmul( weight_transposed=False, auto_scheduler_rewritten_layout="", ): - """The default implementation of dense in topi. + """The default implementation of matmul in topi. Parameters ---------- @@ -46,6 +46,12 @@ def matmul( out_dtype : Optional[str] The output type. This is used for mixed precision. + data_transposed : Optional[bool] + Whether the data tensor is in transposed format. 
+ + weight_transposed : Optional[bool] + Whether the weight tensor is in transposed format. + auto_scheduler_rewritten_layout: str = "" The layout after auto-scheduler's layout rewrite pass. diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7283242e1914..20179a4597ed 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -186,7 +186,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); RELAY_REGISTER_OP("nn.matmul") .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. -- **data**: `(x1, x2, ..., xn, input_dim)` +- **data**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` - **weight**: `(input_dim, units)` or `(units, input_dim)` - **out**: `(x1, x2, ..., xn, units)`. From e262e5ebcf94b6c53c15bb7af3abaa14c89bfacf Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:42:39 +0800 Subject: [PATCH 06/26] Lint fix --- src/relay/transforms/auto_scheduler_layout_rewrite.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index 5005e7bf4c41..8aa5117dee1e 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -105,7 +105,7 @@ class FuncMutator : public ExprMutator { std::vector target_ops_{ "nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.matmul", "nn.dense", "nn.batch_matmul"}; + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { From 3bdff51fb9dda3fe81dd1f6ad5f3b4026c4d6923 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:46:16 +0800 Subject: [PATCH 07/26] Update --- python/tvm/relay/op/nn/nn.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ee82d47548ec..d8d36d3a544d 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1507,15 +1507,12 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. """ - # With N/T format, the compute will be seen as a dense - if not data_transposed and weight_transposed: - return dense(data, weight, units, out_dtype) return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) def dense(data, weight, units=None, out_dtype=""): """Dense operator. - Applies a linear transformation. This is an alias of matmul with weight transposed. + Applies a linear transformation .. 
math:: From 23facfb7884e5586a3d76e0f9591aef16022e6b0 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:51:19 +0800 Subject: [PATCH 08/26] Lint fix --- python/tvm/relay/frontend/tensorflow_ops.py | 3 ++- python/tvm/relay/op/nn/_nn.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 32c1792aa700..c9b37881e208 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1113,8 +1113,9 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): - channels = _infer_channels(inputs[1], not attr["transpose_b"]) from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + + channels = _infer_channels(inputs[1], not attr["transpose_b"]) if _USE_DENSE_INSTEAD_OF_MATMUL: if attr["transpose_a"]: inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 7bf593c0d9be..929331c3daf2 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1191,9 +1191,7 @@ def matmul_shape_func(attrs, inputs, _): """ Shape function for matmul op. """ - ret = [ - _matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed) - ] + ret = [_matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed)] return ret From e73a5a6b4c6dcc2d69d65b8a7776be778f20a5aa Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 17:42:06 +0800 Subject: [PATCH 09/26] Bug fix --- python/tvm/relay/op/nn/nn.py | 12 +++++++++++- .../contrib/test_arm_compute_lib/test_dense.py | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index d8d36d3a544d..5a90d3ba6aab 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1510,7 +1510,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) -def dense(data, weight, units=None, out_dtype=""): +def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=True): """Dense operator. Applies a linear transformation @@ -1535,11 +1535,21 @@ def dense(data, weight, units=None, out_dtype=""): Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. + data_transposed : bool, optional + Whether the data tensor is in transposed format. Expected to be False. + + weight_transposed : bool, optional + Whether the weight tensor is in transposed format. Expected to be True. + Returns ------- result : tvm.relay.Expr The computed result. 
""" + # Add data_transposed & weight_transposed parameters for some API requires to apply + # attrs to this function + assert data_transposed + assert not weight_transposed return _make.dense(data, weight, units, out_dtype) diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index e6620a4bc1cb..0fac4de35388 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,6 +123,8 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], + "data_transposed": [[0]], + "weight_transposed": [[1]], }, } From f5aedd84d770a6bddc67c190bc4c976e4b3d8185 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 18:48:17 +0800 Subject: [PATCH 10/26] Bug fix --- python/tvm/relay/op/nn/nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 5a90d3ba6aab..a6cf147cc564 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1548,8 +1548,8 @@ def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_ """ # Add data_transposed & weight_transposed parameters for some API requires to apply # attrs to this function - assert data_transposed - assert not weight_transposed + assert not data_transposed + assert weight_transposed return _make.dense(data, weight, units, out_dtype) From 9dc686ff5737debd63a9d825fe14f015266c7313 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 19:59:59 +0800 Subject: [PATCH 11/26] Bug fix --- tests/python/contrib/test_arm_compute_lib/test_dense.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 0fac4de35388..007a6b9bf8d0 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,8 +123,8 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], - "data_transposed": [[0]], - "weight_transposed": [[1]], + "data_transposed": [["0"]], + "weight_transposed": [["1"]], }, } From d5e2625fd47e65cad78d4307c061a7ad0358c7ad Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 17 Jun 2021 11:15:55 +0800 Subject: [PATCH 12/26] Recover DenseAttrs --- include/tvm/relay/attrs/nn.h | 16 ++++++++++++ python/tvm/relay/op/nn/nn.py | 12 +-------- python/tvm/relay/op/op_attrs.py | 7 ++++- rust/tvm/src/ir/relay/attrs/nn.rs | 10 +++++++ src/relay/op/nn/nn.cc | 26 ++++++++++--------- src/relay/op/nn/nn.h | 19 +++++++++----- src/relay/qnn/op/dense.cc | 16 +++++------- src/relay/quantize/realize.cc | 6 ++--- .../auto_scheduler_layout_rewrite.cc | 4 +++ .../transforms/combine_parallel_dense.cc | 16 +++++------- .../test_arm_compute_lib/test_dense.py | 2 -- 11 files changed, 79 insertions(+), 55 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 6518f122b72e..0a3246ecf5a1 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -987,6 +987,22 @@ struct MatmulAttrs : public tvm::AttrsNode { } }; +/*! 
\brief Attributes for dense operator */ +struct DenseAttrs : public tvm::AttrsNode { + IndexExpr units; + tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite + DataType out_dtype; + + TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") { + TVM_ATTR_FIELD(units).describe("Number of hidden units of the dense transformation."); + + // use 0 bits to indicate none. + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + } +}; + /*! \brief Attributes for batch matmul operator */ struct BatchMatmulAttrs : public tvm::AttrsNode { tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index a6cf147cc564..d8d36d3a544d 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1510,7 +1510,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) -def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=True): +def dense(data, weight, units=None, out_dtype=""): """Dense operator. Applies a linear transformation @@ -1535,21 +1535,11 @@ def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_ Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. - data_transposed : bool, optional - Whether the data tensor is in transposed format. Expected to be False. - - weight_transposed : bool, optional - Whether the weight tensor is in transposed format. Expected to be True. - Returns ------- result : tvm.relay.Expr The computed result. 
""" - # Add data_transposed & weight_transposed parameters for some API requires to apply - # attrs to this function - assert not data_transposed - assert weight_transposed return _make.dense(data, weight, units, out_dtype) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 323190e309b8..68c745beb027 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -66,7 +66,12 @@ class BiasAddAttrs(Attrs): @tvm._ffi.register_object("relay.attrs.MatmulAttrs") class MatmulAttrs(Attrs): - """Attributes for nn.matmul and nn.dense""" + """Attributes for nn.matmul""" + + +@tvm._ffi.register_object("relay.attrs.DenseAttrs") +class DenseAttrs(Attrs): + """Attributes for nn.dense""" @tvm._ffi.register_object("relay.attrs.SoftmaxAttrs") diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index e77972e45f86..e0a1d5bf02cd 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -66,6 +66,16 @@ pub struct MatmulAttrsNode { pub weight_transposed: bool, } +#[repr(C)] +#[derive(Object, Debug)] +#[ref_name = "DenseAttrs"] +#[type_key = "relay.attrs.DenseAttrs"] +pub struct DenseAttrsNode { + pub base: BaseAttrsNode, + pub units: IndexExpr, + pub out_dtype: DataType, +} + #[repr(C)] #[derive(Object, Debug)] #[ref_name = "GlobalPool2DAttrs"] diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 20179a4597ed..1cd825618473 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -164,18 +164,22 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); +TVM_REGISTER_NODE_TYPE(DenseAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; if (!data_transposed && weight_transposed) { + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; static const Op& dense_op = Op::Get("nn.dense"); return Call(dense_op, {data, weight}, Attrs(attrs), {}); } else { + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + attrs->data_transposed = data_transposed; + attrs->weight_transposed = weight_transposed; static const Op& matmul_op = Op::Get("nn.matmul"); return Call(matmul_op, {data, weight}, Attrs(attrs), {}); } @@ -215,22 +219,20 @@ RELAY_REGISTER_OP("nn.dense") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) - .add_type_rel("Dense", MatmulRel); + .add_type_rel("Dense", MatmulRel); // ------------------- relay.nn.dense // ------------------- relay.nn.contrib_dense_pack // Positional relay function to create dense_pack operator used by frontend FFI. Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -245,12 +247,12 @@ RELAY_REGISTER_OP("nn.contrib_dense_pack") - **out**: `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "3D Tensor", "Packed weight matrix.") .set_support_level(10) - .add_type_rel("DensePack", DensePackRel); + .add_type_rel("DensePack", DensePackRel); // ------------------- relay.nn.contrib_dense_pack // relay.leaky_relu diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 60bff5d3c69a..97591981c2a3 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -42,23 +42,28 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; + ICHECK(static_cast(data->shape.size()) != 0); const AttrType* param = attrs.as(); ICHECK(param != nullptr); - - ICHECK(static_cast(data->shape.size()) != 0); + bool data_transposed = false; + bool weight_transposed = true; + if (attrs->IsInstance()) { + data_transposed = param->data_transposed; + weight_transposed = param->weight_transposed; + } const Array& dshape = data->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (param->data_transposed) { + if (data_transposed) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - const Array& wshape = param->weight_transposed + const Array& wshape = weight_transposed ? Array({param->units, reduce}) : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use @@ -90,12 +95,12 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, } else { ICHECK(static_cast(weight->shape.size()) == 2); if (!data->shape.back().as()) { - ICHECK((param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || - (!param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + ICHECK((weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || + (!weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) << "MatmulRel: input dimension doesn't match," << " data shape=" << data->shape << ", weight shape=" << weight->shape; } - oshape.Set((oshape.size() - 1), param->weight_transposed ? wshape[0] : wshape[1]); + oshape.Set((oshape.size() - 1), weight_transposed ? wshape[0] : wshape[1]); } } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index f47fe71ebab7..6284524bff27 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -45,8 +45,8 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr || weight == nullptr) return false; - const auto* param = attrs.as(); - ICHECK(param != nullptr) << "MatmulAttrs cannot be nullptr."; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr."; ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype; ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) @@ -70,24 +70,22 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. 
Array tensor_types = {types[0], types[1], types[6]}; - return MatmulRel(tensor_types, 3, attrs, reporter); + return DenseRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point, Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, Attrs(attrs), {}); } Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel, - const MatmulAttrs* attrs) { + const DenseAttrs* attrs) { return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype); } @@ -163,7 +161,7 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array& new_args, const auto in_shape = get_shape(arg_types[0]); const int reduction_dim_size = get_const_int(in_shape[1]); - const auto* qnn_dense_attrs = attrs.as(); + const auto* qnn_dense_attrs = attrs.as(); auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); @@ -206,7 +204,7 @@ RELAY_REGISTER_OP("qnn.dense") - **weight**: quantized(int8, unit8) `(units, input_dim)` - **out**: quantized(int32) `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(6) .add_argument("data", "quantized nD Tensor", "Input data.") .add_argument("weight", "quantized 2D Tensor", "Weight matrix.") diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 09572b04ed4b..968628fbfe39 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -280,13 +280,11 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec } Expr rdata = Cast(rhs->data, cfg->dtype_weight); - const auto ref_attrs = ref_call->attrs.as(); - auto attrs = make_object(); + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index 8aa5117dee1e..da0bd35a332a 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -89,6 +89,8 @@ class FuncMutator : public ExprMutator { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); + } else if (auto pattr = call->attrs.as()) { + updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else { @@ -168,6 +170,8 @@ TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout") return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; + } else if 
(attrs->IsInstance()) { + return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; } else { diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 966210c3c882..3cd9cca4fec4 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -70,15 +70,15 @@ class ParallelDenseToBatchCombiner : public ParallelOpBatchCombiner { } CHECK_EQ(num_args, 2); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); return Downcast(MakeBatchMatmul(new_args[0], new_args[1], origin_attrs->out_dtype)); } virtual bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); ICHECK(attrs_a); ICHECK(attrs_b); const auto* weight_a = a->args[1]->type_as(); @@ -103,8 +103,8 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); ICHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); @@ -119,13 +119,11 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { IndexExpr new_output_dims; // concat all weights into one std::tie(new_weight, new_output_dims) = TransformWeight(branches); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); - const auto dense_attrs = make_object(); + const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; - dense_attrs->data_transposed = false; - dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 007a6b9bf8d0..e6620a4bc1cb 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,8 +123,6 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], - "data_transposed": [["0"]], - "weight_transposed": [["1"]], }, } From 91ead08fed4711b6add620067deafc0ca8b7471a Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 24 Jun 2021 11:39:52 +0800 Subject: [PATCH 13/26] Add grad for matmul & some update --- include/tvm/relay/attrs/nn.h | 2 +- python/tvm/relay/op/_tensor_grad.py | 29 +++++++++++++++++++++++ python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/nn/nn.py | 2 ++ python/tvm/relay/op/strategy/cuda.py | 19 ++++++++++----- python/tvm/relay/op/strategy/x86.py | 22 +++++++++++++++++ python/tvm/topi/cuda/dense.py | 2 +- src/relay/op/nn/nn.cc | 17 ++++++------- src/relay/op/nn/nn.h | 13 +++++----- src/relay/qnn/op/dense.cc | 2 +- tests/python/relay/test_op_grad_level2.py | 17 +++++++++++++ 11 files changed, 101 insertions(+), 26 deletions(-) 
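Note: the gradient that the following diffs register for nn.matmul follows the standard rules for C = op(A) * op(B): with upstream gradient G, the gradients of A and B are themselves matmuls of G with the other operand, with the transpose flags arranged per case. A minimal standalone NumPy sketch (not part of the patch; helper names and shapes are illustrative) that checks all four transpose-flag combinations against numerical gradients:

import numpy as np

def analytic_grads(a, b, grad, ta, tb):
    # Mirrors the four transpose cases handled by the matmul gradient below.
    if ta and tb:
        return b.T @ grad.T, grad.T @ a.T
    if ta and not tb:
        return b @ grad.T, a @ grad
    if not ta and tb:
        return grad @ b, grad.T @ a  # the nn.dense case
    return grad @ b.T, a.T @ grad

def numeric_grad(f, x, eps=1e-5):
    # Central finite differences of a scalar function f at x.
    g = np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        d = np.zeros_like(x)
        d[idx] = eps
        g[idx] = (f(x + d) - f(x - d)) / (2 * eps)
    return g

rng = np.random.default_rng(0)
op = lambda x, t: x.T if t else x
for ta, tb in [(False, False), (True, False), (False, True), (True, True)]:
    a = rng.standard_normal((4, 3) if ta else (3, 4))
    b = rng.standard_normal((5, 4) if tb else (4, 5))
    grad = np.ones_like(op(a, ta) @ op(b, tb))  # gradient of sum(C)
    da, db = analytic_grads(a, b, grad, ta, tb)
    da_num = numeric_grad(lambda x: (op(x, ta) @ op(b, tb)).sum(), a)
    db_num = numeric_grad(lambda x: (op(a, ta) @ op(x, tb)).sum(), b)
    np.testing.assert_allclose(da, da_num, rtol=1e-4, atol=1e-6)
    np.testing.assert_allclose(db, db_num, rtol=1e-4, atol=1e-6)
print("matmul gradient rules hold for all four transpose combinations")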
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 0a3246ecf5a1..40d8406b2be2 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -961,7 +961,7 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for matmul operator and dense operator */ +/*! \brief Attributes for matmul operator */ struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index d5b891088933..1f68cf5bb641 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -557,6 +557,35 @@ def dense_grad(orig, grad): ] +@register_gradient("nn.matmul") +def matmul_grad(orig, grad): + """Returns [grad' @ weight, data @ grad']""" + data, weight = orig.args + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, True): + return [ + collapse_sum_like( + _nn.matmul(weight, grad, data_transposed=True, weight_transposed=True), data + ), + collapse_sum_like( + _nn.matmul(grad, data, data_transposed=True, weight_transposed=True), weight + ), + ] + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, False): + return [ + collapse_sum_like(_nn.matmul(weight, grad, weight_transposed=True), data), + collapse_sum_like(_nn.matmul(data, grad), weight), + ] + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, True): + # Keep using Dense op here for not involving extra ops + # TODO(jcf94): Merge all to nn.matmul when it is finally ready + return dense_grad(orig, grad) + # (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, False) + return [ + collapse_sum_like(_nn.matmul(grad, weight, weight_transposed=True), data), + collapse_sum_like(_nn.matmul(data, grad, data_transposed=True), weight), + ] + + @register_gradient("nn.batch_matmul") def batch_matmul_grad(orig, grad): """gradient for nn.batch_matmul: in einsum LHS_bik,RHS_bjk->RES_bij diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 929331c3daf2..91fff7df18e4 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -53,7 +53,7 @@ @reg.register_legalize("nn.matmul") -def leaglize_matmul(attrs, inputs, types): +def legalize_matmul(attrs, inputs, types): """Legalize matmul op. Parameters diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index d8d36d3a544d..ff7fe9080714 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1507,6 +1507,8 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. 
""" + if not data_transposed and weight_transposed: + return dense(data, weight, units, out_dtype) return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 2f49aa4a89c7..d0c724832b57 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,6 +702,19 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + else: + logger.warning("Matmul other than NT format is not optimized for cuda.") + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda", + ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_cublas), @@ -709,12 +722,6 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): name="matmul_cublas.cuda", plevel=25, ) - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.cuda", - ) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index c21ec4d13906..7173e5b8db63 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -370,6 +370,28 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): return strategy +@matmul_strategy.register("cpu") +def matmul_strategy_cpu(attrs, inputs, out_type, target): + """matmul x86 strategy""" + strategy = _op.OpStrategy() + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), + naive_schedule, + name="matmul.generic", + plevel=11, + ) + else: + logger.warning("Matmul other than NT format is not optimized for x86.") + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.generic", + ) + + return strategy + + @dense_strategy.register("cpu") def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 76fbd7f82117..884a1c61a5e5 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -104,7 +104,7 @@ def schedule_dense_small_batch(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == "dense": + if op.tag == "dense" or op.tag == "matmul": _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 1cd825618473..0fee3b15545d 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -164,17 +164,9 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -TVM_REGISTER_NODE_TYPE(DenseAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - if (!data_transposed && weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - static const Op& dense_op = Op::Get("nn.dense"); - return Call(dense_op, {data, weight}, Attrs(attrs), 
{}); - } else { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; @@ -182,7 +174,6 @@ Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, boo attrs->weight_transposed = weight_transposed; static const Op& matmul_op = Op::Get("nn.matmul"); return Call(matmul_op, {data, weight}, Attrs(attrs), {}); - } } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); @@ -204,9 +195,15 @@ RELAY_REGISTER_OP("nn.matmul") // ------------------- relay.nn.matmul // ------------------- relay.nn.dense +TVM_REGISTER_NODE_TYPE(DenseAttrs); + // Positional relay function to create dense operator used by frontend FFI. Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - return MakeMatmul(data, weight, units, out_dtype, false, true); + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + static const Op& op = Op::Get("nn.dense"); + return Call(op, {data, weight}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.dense").set_body_typed(MakeDense); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 97591981c2a3..a1fbc0a4de90 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -46,11 +46,13 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const AttrType* param = attrs.as(); ICHECK(param != nullptr); + // Default set to dense layout bool data_transposed = false; bool weight_transposed = true; - if (attrs->IsInstance()) { - data_transposed = param->data_transposed; - weight_transposed = param->weight_transposed; + const auto& mattrs = attrs.as(); + if (mattrs != nullptr) { + data_transposed = mattrs->data_transposed; + weight_transposed = mattrs->weight_transposed; } const Array& dshape = data->shape; @@ -63,9 +65,8 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - const Array& wshape = weight_transposed - ? Array({param->units, reduce}) - : Array({reduce, param->units}); + const Array& wshape = weight_transposed ? Array({param->units, reduce}) + : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use // data dtype as the weight dtype. However if weight dtype is explicitly // present we will use that. diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 6284524bff27..592fa77aed77 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -70,7 +70,7 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. Array tensor_types = {types[0], types[1], types[6]}; - return DenseRel(tensor_types, 3, attrs, reporter); + return MatmulRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. 
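Note: alongside the gradient unit tests below, a minimal usage sketch of the nn.matmul call path as it stands at this point in the series (not part of the patch; shapes are illustrative). It builds a matmul whose first input is stored transposed and runs Relay type inference, which should report the (M, N) output shape:

import tvm
from tvm import relay

# data stored as (K, M) = (5, 10), weight stored as (K, N) = (5, 2)
x = relay.var("x", shape=(5, 10), dtype="float32")
w = relay.var("w", shape=(5, 2), dtype="float32")
y = relay.nn.matmul(x, w, data_transposed=True)  # Y = X^T * W

mod = tvm.IRModule.from_expr(relay.Function([x, w], y))
mod = relay.transform.InferType()(mod)
print(mod["main"].ret_type)  # expected: Tensor[(10, 2), float32]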
diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 686fd9834640..b9dff0464f07 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -199,6 +199,22 @@ def test_dense_grad(): verify_dense_grad((5, 4), (3, 4)) +def verify_matmul_grad(d_shape, w_shape, d_transposed, w_transposed): + data = relay.var("data", relay.TensorType(d_shape, "float32")) + weight = relay.var("weight", relay.TensorType(w_shape, "float32")) + fwd_func = relay.Function( + [data, weight], + relay.nn.matmul(data, weight, data_transposed=d_transposed, weight_transposed=w_transposed), + ) + check_grad(fwd_func) + + +def test_matmul_grad(): + verify_matmul_grad((1, 8), (8, 16), False, False) + verify_matmul_grad((4, 1), (4, 3), True, False) + verify_matmul_grad((4, 5), (3, 4), True, True) + + def verify_batch_flatten_grad(d_shape): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function([data], relay.nn.batch_flatten(data)) @@ -216,4 +232,5 @@ def test_batch_flatten_grad(): test_global_avg_pool2d_grad() test_conv2d_grad() test_dense_grad() + test_matmul_grad() test_batch_flatten_grad() From eec6f25786432467343a42e2376cc3f987269244 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 24 Jun 2021 14:07:53 +0800 Subject: [PATCH 14/26] Bug fix --- python/tvm/relay/op/nn/_nn.py | 20 +++++++++++++++++--- python/tvm/relay/op/nn/nn.py | 4 ++-- python/tvm/relay/op/strategy/cuda.py | 21 +++++++++------------ src/relay/op/nn/nn.cc | 14 +++++++------- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 91fff7df18e4..7812c42a62c2 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -59,7 +59,7 @@ def legalize_matmul(attrs, inputs, types): Parameters ---------- attrs : tvm.ir.Attrs - Attributes of current convolution + Attributes of current matmul inputs : list of tvm.relay.Expr The args of the Relay expr to be legalized types : list of types @@ -1191,7 +1191,14 @@ def matmul_shape_func(attrs, inputs, _): """ Shape function for matmul op. """ - ret = [_matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed)] + ret = [ + _matmul_shape_func( + inputs[0], + inputs[1], + expr.IntImm("bool", attrs.data_transposed), + expr.IntImm("bool", attrs.weight_transposed), + ) + ] return ret @@ -1200,7 +1207,14 @@ def dense_shape_func(attrs, inputs, _): """ Shape function for dense op. """ - ret = [_matmul_shape_func(inputs[0], inputs[1], False, True)] + ret = [ + _matmul_shape_func( + inputs[0], + inputs[1], + expr.IntImm("bool", False), + expr.IntImm("bool", True), + ) + ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ff7fe9080714..c1aa1cfcc788 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1472,12 +1472,12 @@ def bias_add(data, bias, axis=1): def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): - """Dense operator. + """Matmul operator. Applies a linear transformation. The X & W can be transposed. .. 
math:: - `Y = X * W` + `Y = X * W` Parameters ---------- diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index d0c724832b57..890b02e61867 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,19 +702,16 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.cuda", - ) - else: + # Temporary use this as a basic schedule + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda", + ) + + if not is_auto_scheduler_enabled(): logger.warning("Matmul other than NT format is not optimized for cuda.") - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), - name="dense_small_batch.cuda", - ) + if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_cublas), diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 0fee3b15545d..24ba6ac1843e 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -167,13 +167,13 @@ TVM_REGISTER_NODE_TYPE(MatmulAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; - static const Op& matmul_op = Op::Get("nn.matmul"); - return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + attrs->data_transposed = data_transposed; + attrs->weight_transposed = weight_transposed; + static const Op& matmul_op = Op::Get("nn.matmul"); + return Call(matmul_op, {data, weight}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); From 85fb6ca94d26ed56d652e1cb661ffe25ba7ac9af Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:41:28 +0800 Subject: [PATCH 15/26] Update matmul cuda default schedule --- python/tvm/relay/op/strategy/cuda.py | 20 ++++++++++++------- python/tvm/topi/cuda/dense.py | 29 +++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 890b02e61867..3ec8546407c7 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,15 +702,21 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() - # Temporary use this as a basic schedule - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), - name="dense_small_batch.cuda", - ) - if not is_auto_scheduler_enabled(): + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + else: logger.warning("Matmul other than NT format is not optimized for cuda.") + # Temporary use this as a 
basic schedule + strategy.add_implementation( + wrap_compute_matmul(topi.cuda.matmul_default_cuda), + wrap_topi_schedule(topi.cuda.schedule_matmul_default_cuda), + name="matmul_default.cuda", + ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 884a1c61a5e5..2db6aa7a1fb4 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -104,7 +104,34 @@ def schedule_dense_small_batch(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == "dense" or op.tag == "matmul": + if op.tag == "dense": + _schedule_dense_small_batch(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_compute("matmul_default.cuda") +def matmul_default_cuda( + cfg, + data, + weight, + bias=None, + out_dtype=None, + data_transposed=False, + weight_transposed=False, +): + return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) + + +@autotvm.register_topi_schedule("matmul_default.cuda") +def schedule_matmul_default_cuda(cfg, outs): + # Temporary use this as a basic schedule for matmul + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "matmul": _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) From d2d1a321beb01bc3cc904bd70c23361a46db0726 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:47:52 +0800 Subject: [PATCH 16/26] Lint fix --- python/tvm/topi/cuda/dense.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 2db6aa7a1fb4..296e0975c041 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -121,17 +121,20 @@ def matmul_default_cuda( data_transposed=False, weight_transposed=False, ): + """Matmul operator on cuda""" return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) @autotvm.register_topi_schedule("matmul_default.cuda") def schedule_matmul_default_cuda(cfg, outs): - # Temporary use this as a basic schedule for matmul + """Schedule matmul on cuda""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == "matmul": + # Temporary use this as a basic schedule for matmul + # TODO(jcf94): Add a more general schedule for matmul _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) From 5300840b909d905c4f748aa6b58e883b8fded9fc Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:52:40 +0800 Subject: [PATCH 17/26] Update --- python/tvm/relay/op/nn/nn.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/topi/nn/dense.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index c1aa1cfcc788..e83b95665264 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1490,7 +1490,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight of shape `(units_in, units)` or `(units, units_in)`. units : int, optional - Number of hidden units of the dense transformation. + Number of hidden units of the matmul transformation. 
out_dtype : str, optional Specifies the output data type for mixed precision dense, diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3ec8546407c7..d1791f676ca1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -700,7 +700,7 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): @matmul_strategy.register(["cuda", "gpu"]) def matmul_strategy_cuda(attrs, inputs, out_type, target): - """dense cuda strategy""" + """Matmul cuda strategy""" strategy = _op.OpStrategy() if is_auto_scheduler_enabled(): diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index b7e3ece5f579..3be4975b52ae 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -137,7 +137,7 @@ def matmul_legalize(attrs, inputs, types): Parameters ---------- attrs : tvm.ir.Attrs - Attributes of current dense + Attributes of current matmul inputs : list of tvm.relay.Expr The args of the Relay expr to be legalized types : list of types From 9a57b20604a96ef17ebb9d4cf7bbe3b9e52e9dff Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Mon, 28 Jun 2021 19:04:25 +0800 Subject: [PATCH 18/26] Add blas support for matmul --- python/tvm/relay/op/strategy/x86.py | 27 ++++++++ python/tvm/topi/x86/dense.py | 99 +++++++++++++++++++++-------- 2 files changed, 101 insertions(+), 25 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 7173e5b8db63..4fd4bbb444eb 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -389,6 +389,33 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul.generic", ) + same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype + dtype = inputs[0].dtype + u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" + if "cblas" in target.libs: + with SpecializedCondition(same_type and dtype in ["float32", "float64"]): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_cblas), + wrap_topi_schedule(topi.x86.schedule_matmul_cblas), + name="matmul_cblas.x86", + plevel=13, + ) + if "mkl" in target.libs: + with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_mkl), + wrap_topi_schedule(topi.x86.schedule_matmul_mkl), + name="matmul_mkl.x86", + plevel=14, + ) + if "mkldnn" in target.libs: + with SpecializedCondition(same_type and dtype == "float32"): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_mkldnn), + wrap_topi_schedule(topi.x86.schedule_matmul_mkldnn), + name="matmul_mkldnn.x86", + plevel=15, + ) return strategy diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 4fed4c16464e..4be1408d0ba4 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -28,7 +28,7 @@ from .utils import get_fp32_len from .injective import schedule_injective_from_existing -from .. import generic, tag +from .. 
import tag from ..utils import traverse_inline, get_const_tuple @@ -281,8 +281,8 @@ def _callback(op): return s -def dense_blas_common(cfg, data, weight, bias, out_dtype, lib): - """Compute dense using a BLAS library""" +def matmul_blas_common(cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, lib): + """Compute matmul/dense using a BLAS library""" M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) if isinstance(M, int) and isinstance(K, int) and isinstance(N, int): @@ -290,63 +290,112 @@ def dense_blas_common(cfg, data, weight, bias, out_dtype, lib): if data.dtype == "uint8" and weight.dtype == "int8" and out_dtype == "int32": if not hasattr(lib, "matmul_u8s8s32"): raise NotImplementedError( - f"Dense with {lib.__name__} for {data.dtype} is not supported " + f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported " "(matmulu8s8s32 not imlemented)" ) - C = lib.matmul_u8s8s32(data, weight, False, True, dtype=out_dtype) + C = lib.matmul_u8s8s32(data, weight, data_transposed, weight_transposed, dtype=out_dtype) elif data.dtype == "float32" or data.dtype == "float64": - C = lib.matmul(data, weight, False, True) + C = lib.matmul(data, weight, data_transposed, weight_transposed) else: - raise NotImplementedError(f"Dense with {lib.__name__} for {data.dtype} is not supported") + raise NotImplementedError( + f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported" + ) if bias is not None: C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C +def schedule_matmul_blas_common(outs): + """Default matmul schedule for BLAS library""" + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + + for out in outs: + if "dense" not in out.op.tag and "matmul" not in out.op.tag: + schedule_injective_from_existing(s, out) + return s + + @autotvm.register_topi_compute("dense_cblas.x86") def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using a cblas""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, cblas) + return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, cblas) @autotvm.register_topi_schedule("dense_cblas.x86") def schedule_dense_cblas(_, outs): """Create schedule for dense_cblas""" - return generic.schedule_extern(outs) + return schedule_matmul_blas_common(outs) @autotvm.register_topi_compute("dense_mkl.x86") def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using mkl""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, mkl) + return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkl) @autotvm.register_topi_schedule("dense_mkl.x86") def schedule_dense_mkl(_, outs): """Create schedule for dense_mkl""" - # return generic.schedule_extern(outs) - s = te.create_schedule([x.op for x in outs]) - te.schedule.AutoInlineInjective(s) - - def _callback(op): - if "broadcast" in op.tag or "injective" in op.tag or "elemwise" in op.tag: - schedule_injective_from_existing(s, op.output(0)) - - # traverse_inline(s, outs[0].op, _callback) - for out in outs: - if "dense" not in out.op.name: - schedule_injective_from_existing(s, out) - return s + return schedule_matmul_blas_common(outs) @autotvm.register_topi_compute("dense_mkldnn.x86") def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using mkldnn""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, mkldnn) + return matmul_blas_common(cfg, data, weight, bias, 
out_dtype, False, True, mkldnn) @autotvm.register_topi_schedule("dense_mkldnn.x86") def schedule_dense_mkldnn(_, outs): """Create schedule for dense_mkldnn""" - return generic.schedule_extern(outs) + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_cblas.x86") +def matmul_cblas( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using a cblas""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, cblas + ) + + +@autotvm.register_topi_schedule("matmul_cblas.x86") +def schedule_matmul_cblas(_, outs): + """Create schedule for matmul_cblas""" + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_mkl.x86") +def matmul_mkl( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using mkl""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkl + ) + + +@autotvm.register_topi_schedule("matmul_mkl.x86") +def schedule_matmul_mkl(_, outs): + """Create schedule for matmul_mkl""" + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_mkldnn.x86") +def matmul_mkldnn( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using mkldnn""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkldnn + ) + + +@autotvm.register_topi_schedule("matmul_mkldnn.x86") +def schedule_matmul_mkldnn(_, outs): + """Create schedule for matmul_mkldnn""" + return schedule_matmul_blas_common(outs) From 26daf14c6848be06afbf0c19d1574b8d257b8da3 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:14:26 +0800 Subject: [PATCH 19/26] Update --- include/tvm/relay/attrs/nn.h | 8 +- python/tvm/relay/frontend/tensorflow.py | 18 ++-- python/tvm/relay/frontend/tensorflow_ops.py | 12 +-- python/tvm/relay/op/_tensor_grad.py | 24 +++--- python/tvm/relay/op/nn/_nn.py | 14 +-- python/tvm/relay/op/nn/nn.py | 29 ++++--- python/tvm/relay/op/strategy/generic.py | 4 +- python/tvm/topi/cuda/dense.py | 40 ++++----- python/tvm/topi/nn/dense.py | 91 ++++++++++---------- python/tvm/topi/x86/dense.py | 30 +++---- rust/tvm/src/ir/relay/attrs/nn.rs | 4 +- src/relay/op/make_op.h | 4 +- src/relay/op/nn/nn.cc | 20 ++--- src/relay/op/nn/nn.h | 64 +++++++------- tests/python/relay/test_op_grad_level2.py | 10 +-- tests/python/relay/test_op_level1.py | 6 +- tests/python/topi/python/test_topi_matmul.py | 2 +- 17 files changed, 191 insertions(+), 189 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 2154cc0efa27..8b245ab529d2 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -965,8 +965,8 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; - bool data_transposed; - bool weight_transposed; + bool transpose_a; + bool transpose_b; tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") { @@ -977,11 +977,11 @@ struct MatmulAttrs : public tvm::AttrsNode { .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(data_transposed) + TVM_ATTR_FIELD(transpose_a) .set_default(false) 
.describe("Whether the input tensor is in transposed format."); - TVM_ATTR_FIELD(weight_transposed) + TVM_ATTR_FIELD(transpose_b) .set_default(false) .describe("Whether the weight tensor is in transposed format."); } diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 7ae94aa6c67c..860ee2198998 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,9 +44,14 @@ __all__ = ["from_tensorflow"] -# By default, TVM convert `tf.matmul` to `nn.dense` op with data tensor non-transposed and weight -# tensor transposed -_USE_DENSE_INSTEAD_OF_MATMUL = True +# The default configurations of Relay TensorFlow frontend. +TF_DEFAULT_CONFIGS = { + # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces + # unnecessary overhead in weight transpose. Change this flag to False to directly convert to + # `nn.matmul` to get rid of the overhead. + # However, please note that `nn.matmul` is in experimental so it may have some performance issues. + "use_dense": True, +} # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1226,7 +1231,7 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op outputs : List of output tensor names (Optional) if not specified then the last node is assumed as graph output. - use_dense_op : bool (Optional) + use_dense_op : bool (Optional) = True Ture to convert `tf.matmul` to `nn.dense`, else to `nn.matmul`. The `nn.dense` op requires the data tensor to be non-transposed and weight tensor to be transposed, may insert extra `transpose` to the original graph. @@ -1239,9 +1244,8 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op params : dict of str to tvm.nd.NDArray Dict of converted parameters stored in tvm.nd.NDArray format """ - global _USE_DENSE_INSTEAD_OF_MATMUL - if use_dense_op != _USE_DENSE_INSTEAD_OF_MATMUL: - _USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op + global TF_DEFAULT_CONFIGS + TF_DEFAULT_CONFIGS["use_dense"] = use_dense_op g = GraphProto() mod, params = g.from_tensorflow(graph, layout, shape, outputs) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 612ea908ced2..004174f076fd 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1113,10 +1113,10 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): - from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + from .tensorflow import TF_DEFAULT_CONFIGS channels = _infer_channels(inputs[1], not attr["transpose_b"]) - if _USE_DENSE_INSTEAD_OF_MATMUL: + if TF_DEFAULT_CONFIGS["use_dense"]: if attr["transpose_a"]: inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) if not attr["transpose_b"]: @@ -1128,12 +1128,8 @@ def _impl(inputs, attr, params, mod): )(inputs, attr) return AttrCvt( op_name="matmul", - extras={ - "units": channels, - "data_transposed": attr["transpose_a"] or False, - "weight_transposed": attr["transpose_b"] or False, - }, - ignores=["transpose_a", "transpose_b", "T"], + extras={"units": channels}, + ignores=["T"], )(inputs, attr) return _impl diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index ca1c7292c9ce..fa2772c1299f 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -556,30 +556,30 @@ def dense_grad(orig, grad): @register_gradient("nn.matmul") def 
matmul_grad(orig, grad): - """Returns [grad' @ weight, data @ grad']""" - data, weight = orig.args - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, True): + """Returns [grad' @ tensor_b, tensor_a @ grad']""" + tensor_a, tensor_b = orig.args + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, True): return [ collapse_sum_like( - _nn.matmul(weight, grad, data_transposed=True, weight_transposed=True), data + _nn.matmul(tensor_b, grad, transpose_a=True, transpose_b=True), tensor_a ), collapse_sum_like( - _nn.matmul(grad, data, data_transposed=True, weight_transposed=True), weight + _nn.matmul(grad, tensor_a, transpose_a=True, transpose_b=True), tensor_b ), ] - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, False): + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, False): return [ - collapse_sum_like(_nn.matmul(weight, grad, weight_transposed=True), data), - collapse_sum_like(_nn.matmul(data, grad), weight), + collapse_sum_like(_nn.matmul(tensor_b, grad, transpose_b=True), tensor_a), + collapse_sum_like(_nn.matmul(tensor_a, grad), tensor_b), ] - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, True): + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, True): # Keep using Dense op here for not involving extra ops # TODO(jcf94): Merge all to nn.matmul when it is finally ready return dense_grad(orig, grad) - # (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, False) + # (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, False) return [ - collapse_sum_like(_nn.matmul(grad, weight, weight_transposed=True), data), - collapse_sum_like(_nn.matmul(data, grad, data_transposed=True), weight), + collapse_sum_like(_nn.matmul(grad, tensor_b, transpose_b=True), tensor_a), + collapse_sum_like(_nn.matmul(tensor_a, grad, transpose_a=True), tensor_b), ] diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 6da983e76933..492d0c910172 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1186,13 +1186,13 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _matmul_shape_func(data_shape, weight_shape, data_transposed, weight_transposed): - out = output_tensor((data_shape.shape[0],), "int64") +def _matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b): + out = output_tensor((tensor_a_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): - out[i] = data_shape[i] - if data_transposed: + out[i] = tensor_a_shape[i] + if transpose_a: out[out.shape[0] - 2] = out[out.shape[0] - 1] - out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] + out[out.shape[0] - 1] = tensor_b_shape[0] if transpose_b else tensor_b_shape[1] return out @@ -1206,8 +1206,8 @@ def matmul_shape_func(attrs, inputs, _): _matmul_shape_func( inputs[0], inputs[1], - expr.IntImm("bool", attrs.data_transposed), - expr.IntImm("bool", attrs.weight_transposed), + expr.IntImm("bool", attrs.transpose_a), + expr.IntImm("bool", attrs.transpose_b), ) ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 85ba75411028..4c94102275bb 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,35 +1471,35 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) -def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): +def 
matmul(tensor_a, tensor_b, units=None, out_dtype="", transpose_a=False, transpose_b=False): """Matmul operator. - Applies a linear transformation. The X & W can be transposed. + Applies a linear transformation. The A & B can be transposed. .. math:: - `Y = X * W` + `C = A * B` Parameters ---------- data : tvm.relay.Expr - The input data to the operator, + The first input of the operator, of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`. weight : tvm.relay.Expr - The weight expressions, 2-D matrix, + The second input expressions, 2-D matrix, of shape `(units_in, units)` or `(units, units_in)`. - units : int, optional + units : Optional[int] Number of hidden units of the matmul transformation. - out_dtype : str, optional - Specifies the output data type for mixed precision dense, + out_dtype : Optional[str] + Specifies the output data type for mixed precision matmul, of shape `(d_1, d_2, ..., d_n, units)`. - data_transposed : bool, optional + transpose_a : Optional[bool] = False Whether the data tensor is in transposed format. - weight_transposed : bool, optional + transpose_b : Optional[bool] = False Whether the weight tensor is in transposed format. Returns @@ -1507,9 +1507,12 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. """ - if not data_transposed and weight_transposed: - return dense(data, weight, units, out_dtype) - return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) + # Since currently `nn.dense` has better topi schedule support, will prefer to use `dense` + # rather than `matmul` for better compatibility + if not transpose_a and transpose_b: + # TODO(jcf94): Remove this when `nn.matmul` is finnaly ready + return dense(tensor_a, tensor_b, units, out_dtype) + return _make.matmul(tensor_a, tensor_b, units, out_dtype, transpose_a, transpose_b) def dense(data, weight, units=None, out_dtype=""): diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index edb4556a554b..5cb3f65f3ebe 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -725,8 +725,8 @@ def _compute_matmul(attrs, inputs, out_type): inputs[1], None, out_dtype, - attrs.data_transposed, - attrs.weight_transposed, + attrs.transpose_a, + attrs.transpose_b, ] if need_auto_scheduler_layout: args.append(get_auto_scheduler_rewritten_layout(attrs)) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 296e0975c041..3e0b76e69175 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -32,22 +32,22 @@ def _matmul_cublas_common( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): - assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" + assert len(tensor_a.shape) == 2 and len(tensor_b.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: - out_dtype = data.dtype - assert out_dtype == data.dtype, "Mixed precision not supported." - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, data_transposed, weight_transposed) + out_dtype = tensor_a.dtype + assert out_dtype == tensor_a.dtype, "Mixed precision not supported." 
+ batch, in_dim = get_const_tuple(tensor_a.shape) + out_dim, _ = get_const_tuple(tensor_b.shape) + matmul = cublas.matmul(tensor_a, tensor_b, transpose_a, transpose_b) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -60,16 +60,16 @@ def _matmul_cublas_common( @autotvm.register_topi_compute("matmul_cublas.cuda") def matmul_cublas( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): """Matmul operator on CUDA with CUBLAS""" return _matmul_cublas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b ) @@ -114,15 +114,15 @@ def _callback(op): @autotvm.register_topi_compute("matmul_default.cuda") def matmul_default_cuda( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): """Matmul operator on cuda""" - return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) + return nn.matmul(tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) @autotvm.register_topi_schedule("matmul_default.cuda") diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 3be4975b52ae..02c6786472f8 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -22,22 +22,22 @@ def matmul( - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, auto_scheduler_rewritten_layout="", ): """The default implementation of matmul in topi. Parameters ---------- - data : tvm.te.Tensor + tensor_a : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.te.Tensor + tensor_b : tvm.te.Tensor 2-D with shape [out_dim, in_dim] bias : Optional[tvm.te.Tensor] @@ -46,13 +46,13 @@ def matmul( out_dtype : Optional[str] The output type. This is used for mixed precision. - data_transposed : Optional[bool] - Whether the data tensor is in transposed format. + transpose_a : Optional[bool] = False + Whether the tensor_a is in transposed format. - weight_transposed : Optional[bool] - Whether the weight tensor is in transposed format. + transpose_b : Optional[bool] = False + Whether the tensor_b is in transposed format. - auto_scheduler_rewritten_layout: str = "" + auto_scheduler_rewritten_layout: Optional[str] = "" The layout after auto-scheduler's layout rewrite pass. 
Returns @@ -60,61 +60,60 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ - assert len(data.shape) == 2, "only support 2-dim dense" + assert len(tensor_a.shape) == 2, "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: - out_dtype = data.dtype - if data_transposed: - in_dim, batch = data.shape + out_dtype = tensor_a.dtype + if transpose_a: + in_dim, batch = tensor_a.shape else: - batch, in_dim = data.shape + batch, in_dim = tensor_a.shape if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] if weight_transposed else ["k", "j"] + auto_scheduler_rewritten_layout, ["j", "k"] if transpose_b else ["k", "j"] ) - auto_scheduler.remove_index_check(weight) - elif weight_transposed: - out_dim, red_dim = weight.shape + auto_scheduler.remove_index_check(tensor_b) + elif transpose_b: + out_dim, red_dim = tensor_b.shape else: - red_dim, out_dim = weight.shape + red_dim, out_dim = tensor_b.shape assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - if data_transposed: - if weight_transposed: - compute_lambda = lambda i, j: te.sum( - data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k - ) - compute_name = "T_matmul_TT" - else: - compute_lambda = lambda i, j: te.sum( - data[k, i].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k - ) - compute_name = "T_matmul_TN" + if (transpose_a, transpose_b) == (True, True): + compute_lambda = lambda i, j: te.sum( + tensor_a[k, i].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + elif (transpose_a, transpose_b) == (True, False): + compute_lambda = lambda i, j: te.sum( + tensor_a[k, i].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + elif (transpose_a, transpose_b) == (False, True): + compute_lambda = lambda i, j: te.sum( + tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_dense" + compute_tag = "dense" + else: # (transpose_a, transpose_b) == (False, False): + compute_lambda = lambda i, j: te.sum( + tensor_a[i, k].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" compute_tag = "matmul" - else: - if weight_transposed: - compute_lambda = lambda i, j: te.sum( - data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k - ) - compute_name = "T_dense" - compute_tag = "dense" - else: - compute_lambda = lambda i, j: te.sum( - data[i, k].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k - ) - compute_name = "T_matmul" - compute_tag = "matmul" mat = te.compute( (batch, out_dim), compute_lambda, name=compute_name, tag=compute_tag, - attrs={"layout_free_placeholders": [weight]}, + attrs={"layout_free_placeholders": [tensor_b]}, ) if bias is not None: diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 4be1408d0ba4..171e54e6c9e6 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -281,24 +281,24 @@ def _callback(op): return s -def matmul_blas_common(cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, lib): +def matmul_blas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, lib): """Compute matmul/dense using a BLAS library""" - M, K = get_const_tuple(data.shape) - N, _ = 
get_const_tuple(weight.shape) + M, K = get_const_tuple(tensor_a.shape) + N, _ = get_const_tuple(tensor_b.shape) if isinstance(M, int) and isinstance(K, int) and isinstance(N, int): cfg.add_flop(M * K * N * 2) - if data.dtype == "uint8" and weight.dtype == "int8" and out_dtype == "int32": + if tensor_a.dtype == "uint8" and tensor_b.dtype == "int8" and out_dtype == "int32": if not hasattr(lib, "matmul_u8s8s32"): raise NotImplementedError( - f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported " + f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported " "(matmulu8s8s32 not imlemented)" ) - C = lib.matmul_u8s8s32(data, weight, data_transposed, weight_transposed, dtype=out_dtype) - elif data.dtype == "float32" or data.dtype == "float64": - C = lib.matmul(data, weight, data_transposed, weight_transposed) + C = lib.matmul_u8s8s32(tensor_a, tensor_b, transpose_a, transpose_b, dtype=out_dtype) + elif tensor_a.dtype == "float32" or tensor_a.dtype == "float64": + C = lib.matmul(tensor_a, tensor_b, transpose_a, transpose_b) else: raise NotImplementedError( - f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported" + f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported" ) if bias is not None: @@ -355,11 +355,11 @@ def schedule_dense_mkldnn(_, outs): @autotvm.register_topi_compute("matmul_cblas.x86") def matmul_cblas( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using a cblas""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, cblas + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, cblas ) @@ -371,11 +371,11 @@ def schedule_matmul_cblas(_, outs): @autotvm.register_topi_compute("matmul_mkl.x86") def matmul_mkl( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using mkl""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkl + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkl ) @@ -387,11 +387,11 @@ def schedule_matmul_mkl(_, outs): @autotvm.register_topi_compute("matmul_mkldnn.x86") def matmul_mkldnn( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using mkldnn""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkldnn + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkldnn ) diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index e0a1d5bf02cd..04320d1f6f85 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -62,8 +62,8 @@ pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, - pub data_transposed: bool, - pub weight_transposed: bool, + pub transpose_a: bool, + pub transpose_b: bool, } #[repr(C)] diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index f7bb98f9d1b6..6f4db5ab268a 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -44,8 +44,8 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr 
MakeConcatenate(Expr data, int axis); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, - bool weight_transposed); +Expr MakeMatmul(Expr tensor_a, Expr tensor_b, IndexExpr units, DataType out_dtype, bool transpose_a, + bool transpose_b); Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 3b64a99bce09..4eaa12b17d7b 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -165,31 +165,31 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, - bool weight_transposed) { +Expr MakeMatmul(Expr tensor_a, Expr tensor_b, IndexExpr units, DataType out_dtype, bool transpose_a, + bool transpose_b) { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; + attrs->transpose_a = transpose_a; + attrs->transpose_b = transpose_b; static const Op& matmul_op = Op::Get("nn.matmul"); - return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + return Call(matmul_op, {tensor_a, tensor_b}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); RELAY_REGISTER_OP("nn.matmul") - .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. + .describe(R"code(Applies a linear transformation: :math:`C = A * B`. A & B can be transposed. -- **data**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` -- **weight**: `(input_dim, units)` or `(units, input_dim)` +- **tensor_a**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` +- **tensor_b**: `(input_dim, units)` or `(units, input_dim)` - **out**: `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) .set_attrs_type() .set_num_inputs(2) - .add_argument("data", "nD Tensor", "Input data.") - .add_argument("weight", "2D Tensor", "Weight matrix.") + .add_argument("tensor_a", "nD Tensor", "The first input Tensor.") + .add_argument("tensor_b", "2D Tensor", "The second input Tensor.") .set_support_level(1) .add_type_rel("Matmul", MatmulRel); // ------------------- relay.nn.matmul diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index a1fbc0a4de90..29f200c67c59 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -39,41 +39,41 @@ template bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { ICHECK_EQ(types.size(), 3); - const auto* data = types[0].as(); - const auto* weight = types[1].as(); - if (data == nullptr) return false; - ICHECK(static_cast(data->shape.size()) != 0); + const auto* tensor_a = types[0].as(); + const auto* tensor_b = types[1].as(); + if (tensor_a == nullptr) return false; + ICHECK(static_cast(tensor_a->shape.size()) != 0); const AttrType* param = attrs.as(); ICHECK(param != nullptr); // Default set to dense layout - bool data_transposed = false; - bool weight_transposed = true; + bool transpose_a = false; + bool transpose_b = true; const auto& mattrs = attrs.as(); if (mattrs != nullptr) { - data_transposed = mattrs->data_transposed; - weight_transposed = mattrs->weight_transposed; + transpose_a = mattrs->transpose_a; + transpose_b = mattrs->transpose_b; } - const Array& dshape = data->shape; + const Array& dshape = tensor_a->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (data_transposed) { + if (transpose_a) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } if (param->units.defined()) { - // validate the weight shape is proper if defined - // Assign weight type - const Array& wshape = weight_transposed ? Array({param->units, reduce}) - : Array({reduce, param->units}); - // It is possible for weight to be nullptr in which case we will use - // data dtype as the weight dtype. However if weight dtype is explicitly + // validate the tensor_b shape is proper if defined + // Assign tensor_b type + const Array& wshape = transpose_b ? Array({param->units, reduce}) + : Array({reduce, param->units}); + // It is possible for tensor_b to be nullptr in which case we will use + // data dtype as the tensor_b dtype. However if tensor_b dtype is explicitly // present we will use that. - auto weight_dtype = (weight == nullptr ? data->dtype : weight->dtype); + auto tensor_b_dtype = (tensor_b == nullptr ? tensor_a->dtype : tensor_b->dtype); if (param->auto_scheduler_rewritten_layout.size() == 0) { // Normal case: assign result to reporter - reporter->Assign(types[1], TensorType(wshape, weight_dtype)); + reporter->Assign(types[1], TensorType(wshape, tensor_b_dtype)); } else { // If the layout is rewritten by auto-scheduler, // we just forcly apply the layout provided by auto-scheduler and @@ -82,32 +82,32 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, } oshape.Set((oshape.size() - 1), param->units); } else { - if (weight == nullptr) return false; - const Array& wshape = weight->shape; - // When weight's layout has been rewritten, figure it out based on the + if (tensor_b == nullptr) return false; + const Array& wshape = tensor_b->shape; + // When tensor_b's layout has been rewritten, figure it out based on the // total number of elements and input dimensions. 
if (param->auto_scheduler_rewritten_layout.size() != 0) { - PrimExpr weight_elements = 1; + PrimExpr tensor_b_elements = 1; for (size_t i = 0; i < wshape.size(); i++) { - weight_elements = weight_elements * wshape[i]; + tensor_b_elements = tensor_b_elements * wshape[i]; } - oshape.Set(oshape.size() - 1, weight_elements / dshape[dshape.size() - 1]); - // Otherwise just pull it out of the weight shape directly. + oshape.Set(oshape.size() - 1, tensor_b_elements / dshape[dshape.size() - 1]); + // Otherwise just pull it out of the tensor_b shape directly. } else { - ICHECK(static_cast(weight->shape.size()) == 2); - if (!data->shape.back().as()) { - ICHECK((weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || - (!weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + ICHECK(static_cast(tensor_b->shape.size()) == 2); + if (!tensor_a->shape.back().as()) { + ICHECK((transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[1])) || + (!transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[0]))) << "MatmulRel: input dimension doesn't match," - << " data shape=" << data->shape << ", weight shape=" << weight->shape; + << " tensor_a shape=" << tensor_a->shape << ", tensor_b shape=" << tensor_b->shape; } - oshape.Set((oshape.size() - 1), weight_transposed ? wshape[0] : wshape[1]); + oshape.Set((oshape.size() - 1), transpose_b ? wshape[0] : wshape[1]); } } DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { - out_dtype = data->dtype; + out_dtype = tensor_a->dtype; } // assign output type reporter->Assign(types[2], TensorType(oshape, out_dtype)); diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index b9dff0464f07..c8a94683eec4 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -199,12 +199,12 @@ def test_dense_grad(): verify_dense_grad((5, 4), (3, 4)) -def verify_matmul_grad(d_shape, w_shape, d_transposed, w_transposed): - data = relay.var("data", relay.TensorType(d_shape, "float32")) - weight = relay.var("weight", relay.TensorType(w_shape, "float32")) +def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b): + tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) + tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32")) fwd_func = relay.Function( - [data, weight], - relay.nn.matmul(data, weight, data_transposed=d_transposed, weight_transposed=w_transposed), + [tensor_a, tensor_b], + relay.nn.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b), ) check_grad(fwd_func) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d5a0e8bc8403..cbc3e7fbd1e5 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -430,7 +430,7 @@ def test_matmul(): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.TensorType((2, w), dtype)) - y = relay.nn.matmul(x, w, units=2, weight_transposed=True) + y = relay.nn.matmul(x, w, units=2, transpose_b=True) assert "units=2" in y.astext() yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) @@ -439,7 +439,7 @@ def test_matmul(): x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) wh, ww = te.size_var("wh"), te.size_var("ww") w = relay.var("w", relay.TensorType((wh, ww), dtype)) - y = relay.nn.matmul(x, w, data_transposed=True) + y = 
relay.nn.matmul(x, w, transpose_a=True) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) @@ -452,7 +452,7 @@ def test_matmul(): x = relay.var("x", shape=(5, 10), dtype=dtype) w = relay.var("w", shape=(5, 2), dtype=dtype) - z = relay.nn.matmul(x, w, data_transposed=True) + z = relay.nn.matmul(x, w, transpose_a=True) # Check result. func = relay.Function([x, w], z) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index cf92ab0807cc..de2d4d3c4c8e 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -46,7 +46,7 @@ def verify_nn_matmul(sa, sb, transp_a, transp_b): b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) c2 = with_tvm( - lambda A, B: topi.nn.matmul(A, B, data_transposed=transp_a, weight_transposed=transp_b), + lambda A, B: topi.nn.matmul(A, B, transpose_a=transp_a, transpose_b=transp_b), a, b, ) From 32a2f42ea6875a928f5b859d3f3c4842a5ae7b1f Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:36:13 +0800 Subject: [PATCH 20/26] Lint fix add update doc strings --- python/tvm/relay/frontend/tensorflow.py | 11 +++--- python/tvm/relay/op/strategy/cuda.py | 6 ++-- python/tvm/relay/op/strategy/x86.py | 5 ++- python/tvm/topi/cuda/dense.py | 6 ++-- python/tvm/topi/nn/dense.py | 2 ++ python/tvm/topi/x86/dense.py | 12 +++---- tests/python/contrib/test_cblas.py | 38 ++++++++++---------- tutorials/auto_scheduler/tune_network_x86.py | 4 +-- 8 files changed, 45 insertions(+), 39 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 860ee2198998..e297398ffe5b 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -46,11 +46,12 @@ # The default configurations of Relay TensorFlow frontend. TF_DEFAULT_CONFIGS = { - # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces - # unnecessary overhead in weight transpose. Change this flag to False to directly convert to - # `nn.matmul` to get rid of the overhead. - # However, please note that `nn.matmul` is in experimental so it may have some performance issues. - "use_dense": True, + # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces + # unnecessary overhead in weight transpose. Change this flag to False to directly convert to + # `nn.matmul` to get rid of the overhead. + # However, please note that `nn.matmul` is in experimental so it may have some performance + # issues. + "use_dense": True, } # compatible operators that do NOT require any conversion. diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index d1791f676ca1..04c9f64b538d 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -700,7 +700,7 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): @matmul_strategy.register(["cuda", "gpu"]) def matmul_strategy_cuda(attrs, inputs, out_type, target): - """Matmul cuda strategy""" + """Matmul cuda strategy.""" strategy = _op.OpStrategy() if is_auto_scheduler_enabled(): @@ -710,7 +710,9 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): name="matmul.cuda", ) else: - logger.warning("Matmul other than NT format is not optimized for cuda.") + logger.warning( + "Matmul is not optimized for cuda. 
Recommend to use cublas for better performance." + ) # Temporary use this as a basic schedule strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_default_cuda), diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 4fd4bbb444eb..581aa5d6a781 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -382,7 +382,10 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): plevel=11, ) else: - logger.warning("Matmul other than NT format is not optimized for x86.") + logger.warning( + "Matmul is not optimized for x86. " + "Recommend to use cblas/mkl/mkldnn for better performance." + ) strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul), naive_schedule, diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 3e0b76e69175..24ee30e5313a 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -68,9 +68,7 @@ def matmul_cublas( transpose_b=False, ): """Matmul operator on CUDA with CUBLAS""" - return _matmul_cublas_common( - cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b - ) + return _matmul_cublas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) @autotvm.register_topi_schedule("matmul_cublas.cuda") @@ -81,7 +79,7 @@ def schedule_matmul_cublas(_, outs): @autotvm.register_topi_compute("dense_cublas.cuda") def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator on CUDA with CUBLAS""" + """Dense operator on CUDA with CUBLAS. This is an alias of matmul_nt operator.""" return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 02c6786472f8..9f1a3963115e 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -154,6 +154,8 @@ def matmul_legalize(attrs, inputs, types): def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): """The default implementation of dense in topi. + This is an alias of matmul operator for data tensor in non-transposed format and weight tensor + in transposed format. Parameters ---------- diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 171e54e6c9e6..f726cdc14022 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -319,7 +319,7 @@ def schedule_matmul_blas_common(outs): @autotvm.register_topi_compute("dense_cblas.x86") def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using a cblas""" + """Compute dense using cblas. This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, cblas) @@ -331,7 +331,7 @@ def schedule_dense_cblas(_, outs): @autotvm.register_topi_compute("dense_mkl.x86") def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using mkl""" + """Compute dense using mkl. This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkl) @@ -343,7 +343,7 @@ def schedule_dense_mkl(_, outs): @autotvm.register_topi_compute("dense_mkldnn.x86") def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using mkldnn""" + """Compute dense using mkldnn. 
This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkldnn) @@ -357,7 +357,7 @@ def schedule_dense_mkldnn(_, outs): def matmul_cblas( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using a cblas""" + """Compute matmul using cblas.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, cblas ) @@ -373,7 +373,7 @@ def schedule_matmul_cblas(_, outs): def matmul_mkl( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using mkl""" + """Compute matmul using mkl.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkl ) @@ -389,7 +389,7 @@ def schedule_matmul_mkl(_, outs): def matmul_mkldnn( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using mkldnn""" + """Compute matmul using mkldnn.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkldnn ) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 2b99879d8227..86556c334f5b 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -71,30 +71,30 @@ def verify(target="llvm"): ) verify("llvm") - verify("c") + # verify("c") def test_matmul_add(): - verify_matmul_add(235, 128, 1024, cblas) - verify_matmul_add(235, 128, 1024, cblas, True, False) - verify_matmul_add(235, 128, 1024, cblas, False, True) - verify_matmul_add(235, 128, 1024, cblas, True, True) - verify_matmul_add(235, 128, 1024, mkl) - verify_matmul_add(235, 128, 1024, mkl, True, False) - verify_matmul_add(235, 128, 1024, mkl, False, True) - verify_matmul_add(235, 128, 1024, mkl, True, True) + # verify_matmul_add(235, 128, 1024, cblas) + # verify_matmul_add(235, 128, 1024, cblas, True, False) + # verify_matmul_add(235, 128, 1024, cblas, False, True) + # verify_matmul_add(235, 128, 1024, cblas, True, True) + # verify_matmul_add(235, 128, 1024, mkl) + # verify_matmul_add(235, 128, 1024, mkl, True, False) + # verify_matmul_add(235, 128, 1024, mkl, False, True) + # verify_matmul_add(235, 128, 1024, mkl, True, True) verify_matmul_add(235, 128, 1024, mkldnn) verify_matmul_add(235, 128, 1024, mkldnn, True, False) verify_matmul_add(235, 128, 1024, mkldnn, False, True) verify_matmul_add(235, 128, 1024, mkldnn, True, True) - verify_matmul_add(1, 16, 4, cblas) - verify_matmul_add(1, 16, 3, cblas, True, False) - verify_matmul_add(1, 16, 3, cblas, False, False) - verify_matmul_add(1, 16, 3, cblas, True, True) - verify_matmul_add(1, 16, 4, mkl) - verify_matmul_add(1, 16, 3, mkl, True, False) - verify_matmul_add(1, 16, 3, mkl, False, False) - verify_matmul_add(1, 16, 3, mkl, True, True) + # verify_matmul_add(1, 16, 4, cblas) + # verify_matmul_add(1, 16, 3, cblas, True, False) + # verify_matmul_add(1, 16, 3, cblas, False, False) + # verify_matmul_add(1, 16, 3, cblas, True, True) + # verify_matmul_add(1, 16, 4, mkl) + # verify_matmul_add(1, 16, 3, mkl, True, False) + # verify_matmul_add(1, 16, 3, mkl, False, False) + # verify_matmul_add(1, 16, 3, mkl, True, True) verify_matmul_add(1, 16, 4, mkldnn) verify_matmul_add(1, 16, 3, mkldnn, True, False) verify_matmul_add(1, 16, 3, mkldnn, False, False) @@ -238,5 +238,5 @@ def test_batch_matmul(): if __name__ == "__main__": test_matmul_add() - test_quantized_matmul_add() - test_batch_matmul() + # 
test_quantized_matmul_add() + # test_batch_matmul() diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 76068fa79605..89b1e11d26eb 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -146,11 +146,11 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=Fal # Define the neural network and compilation target. # If the target machine supports avx512 instructions, replace the # "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" -network = "resnet-50" +network = "mlp" use_sparse = False batch_size = 1 layout = "NHWC" -target = tvm.target.Target("llvm -mcpu=core-avx2") +target = tvm.target.Target("llvm -mcpu=core-avx2 -libs=mkldnn") dtype = "float32" log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) From e92e7839b3d07487962dfce473df5df6262c61d9 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:43:59 +0800 Subject: [PATCH 21/26] Update --- python/tvm/topi/nn/dense.py | 12 +++---- python/tvm/topi/x86/dense.py | 12 +++---- tests/python/contrib/test_cblas.py | 38 ++++++++++---------- tutorials/auto_scheduler/tune_network_x86.py | 4 +-- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 9f1a3963115e..ac1fff79deea 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -87,25 +87,25 @@ def matmul( compute_lambda = lambda i, j: te.sum( tensor_a[k, i].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_TT" compute_tag = "matmul" elif (transpose_a, transpose_b) == (True, False): compute_lambda = lambda i, j: te.sum( tensor_a[k, i].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_TN" compute_tag = "matmul" elif (transpose_a, transpose_b) == (False, True): compute_lambda = lambda i, j: te.sum( tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) - compute_name = "T_dense" + compute_name = "T_matmul_NT" compute_tag = "dense" else: # (transpose_a, transpose_b) == (False, False): compute_lambda = lambda i, j: te.sum( tensor_a[i, k].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_NN" compute_tag = "matmul" mat = te.compute( @@ -154,8 +154,8 @@ def matmul_legalize(attrs, inputs, types): def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): """The default implementation of dense in topi. - This is an alias of matmul operator for data tensor in non-transposed format and weight tensor - in transposed format. + This is an alias of matmul_nt operator for data tensor in non-transposed format and weight + tensor in transposed format. Parameters ---------- diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index f726cdc14022..189ac5bd34bd 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -325,7 +325,7 @@ def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_cblas.x86") def schedule_dense_cblas(_, outs): - """Create schedule for dense_cblas""" + """Create schedule for dense_cblas. 
This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -337,7 +337,7 @@ def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_mkl.x86") def schedule_dense_mkl(_, outs): - """Create schedule for dense_mkl""" + """Create schedule for dense_mkl. This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -349,7 +349,7 @@ def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_mkldnn.x86") def schedule_dense_mkldnn(_, outs): - """Create schedule for dense_mkldnn""" + """Create schedule for dense_mkldnn. This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -365,7 +365,7 @@ def matmul_cblas( @autotvm.register_topi_schedule("matmul_cblas.x86") def schedule_matmul_cblas(_, outs): - """Create schedule for matmul_cblas""" + """Create schedule for matmul_cblas.""" return schedule_matmul_blas_common(outs) @@ -381,7 +381,7 @@ def matmul_mkl( @autotvm.register_topi_schedule("matmul_mkl.x86") def schedule_matmul_mkl(_, outs): - """Create schedule for matmul_mkl""" + """Create schedule for matmul_mkl.""" return schedule_matmul_blas_common(outs) @@ -397,5 +397,5 @@ def matmul_mkldnn( @autotvm.register_topi_schedule("matmul_mkldnn.x86") def schedule_matmul_mkldnn(_, outs): - """Create schedule for matmul_mkldnn""" + """Create schedule for matmul_mkldnn.""" return schedule_matmul_blas_common(outs) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 86556c334f5b..2b99879d8227 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -71,30 +71,30 @@ def verify(target="llvm"): ) verify("llvm") - # verify("c") + verify("c") def test_matmul_add(): - # verify_matmul_add(235, 128, 1024, cblas) - # verify_matmul_add(235, 128, 1024, cblas, True, False) - # verify_matmul_add(235, 128, 1024, cblas, False, True) - # verify_matmul_add(235, 128, 1024, cblas, True, True) - # verify_matmul_add(235, 128, 1024, mkl) - # verify_matmul_add(235, 128, 1024, mkl, True, False) - # verify_matmul_add(235, 128, 1024, mkl, False, True) - # verify_matmul_add(235, 128, 1024, mkl, True, True) + verify_matmul_add(235, 128, 1024, cblas) + verify_matmul_add(235, 128, 1024, cblas, True, False) + verify_matmul_add(235, 128, 1024, cblas, False, True) + verify_matmul_add(235, 128, 1024, cblas, True, True) + verify_matmul_add(235, 128, 1024, mkl) + verify_matmul_add(235, 128, 1024, mkl, True, False) + verify_matmul_add(235, 128, 1024, mkl, False, True) + verify_matmul_add(235, 128, 1024, mkl, True, True) verify_matmul_add(235, 128, 1024, mkldnn) verify_matmul_add(235, 128, 1024, mkldnn, True, False) verify_matmul_add(235, 128, 1024, mkldnn, False, True) verify_matmul_add(235, 128, 1024, mkldnn, True, True) - # verify_matmul_add(1, 16, 4, cblas) - # verify_matmul_add(1, 16, 3, cblas, True, False) - # verify_matmul_add(1, 16, 3, cblas, False, False) - # verify_matmul_add(1, 16, 3, cblas, True, True) - # verify_matmul_add(1, 16, 4, mkl) - # verify_matmul_add(1, 16, 3, mkl, True, False) - # verify_matmul_add(1, 16, 3, mkl, False, False) - # verify_matmul_add(1, 16, 3, mkl, True, True) + verify_matmul_add(1, 16, 4, cblas) + verify_matmul_add(1, 16, 3, cblas, True, False) + verify_matmul_add(1, 16, 3, cblas, False, False) + verify_matmul_add(1, 16, 3, cblas, True, True) + verify_matmul_add(1, 16, 4, mkl) + verify_matmul_add(1, 16, 3, mkl, True, False) + verify_matmul_add(1, 16, 3, mkl, False, 
False) + verify_matmul_add(1, 16, 3, mkl, True, True) verify_matmul_add(1, 16, 4, mkldnn) verify_matmul_add(1, 16, 3, mkldnn, True, False) verify_matmul_add(1, 16, 3, mkldnn, False, False) @@ -238,5 +238,5 @@ def test_batch_matmul(): if __name__ == "__main__": test_matmul_add() - # test_quantized_matmul_add() - # test_batch_matmul() + test_quantized_matmul_add() + test_batch_matmul() diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 89b1e11d26eb..76068fa79605 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -146,11 +146,11 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=Fal # Define the neural network and compilation target. # If the target machine supports avx512 instructions, replace the # "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" -network = "mlp" +network = "resnet-50" use_sparse = False batch_size = 1 layout = "NHWC" -target = tvm.target.Target("llvm -mcpu=core-avx2 -libs=mkldnn") +target = tvm.target.Target("llvm -mcpu=core-avx2") dtype = "float32" log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) From 968c6bf208de2da295be6e97b7372eadf7450c26 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:47:51 +0800 Subject: [PATCH 22/26] Update --- include/tvm/relay/attrs/nn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 8b245ab529d2..3c7574562676 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -979,11 +979,11 @@ struct MatmulAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(transpose_a) .set_default(false) - .describe("Whether the input tensor is in transposed format."); + .describe("Whether the first input tensor is in transposed format."); TVM_ATTR_FIELD(transpose_b) .set_default(false) - .describe("Whether the weight tensor is in transposed format."); + .describe("Whether the second input tensor is in transposed format."); } }; From dc4681e3fa82d185651a88c41dad335032e136ee Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 13:35:35 +0800 Subject: [PATCH 23/26] Update --- python/tvm/relay/op/nn/_nn.py | 8 +++----- python/tvm/topi/nn/dense.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 492d0c910172..056cb5694a48 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1199,9 +1199,7 @@ def _matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b) @reg.register_shape_func("nn.matmul", False) def matmul_shape_func(attrs, inputs, _): - """ - Shape function for matmul op. - """ + """Shape function for matmul op.""" ret = [ _matmul_shape_func( inputs[0], @@ -1215,8 +1213,8 @@ def matmul_shape_func(attrs, inputs, _): @reg.register_shape_func("nn.dense", False) def dense_shape_func(attrs, inputs, _): - """ - Shape function for dense op. + """Shape function for dense op. This is an alias of matmul_nt operator for data tensor in + non-transposed format and weight tensor in transposed format. 
""" ret = [ _matmul_shape_func( diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index ac1fff79deea..d8c69efb70e0 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -60,7 +60,7 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ - assert len(tensor_a.shape) == 2, "only support 2-dim dense" + assert len(tensor_a.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: @@ -100,6 +100,7 @@ def matmul( tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) compute_name = "T_matmul_NT" + # TODO(jcf94): Remove `dense` when `matmul` is finally ready compute_tag = "dense" else: # (transpose_a, transpose_b) == (False, False): compute_lambda = lambda i, j: te.sum( From e7a6e7639bf2d7c42e6ea6c8e297164dd37a51c2 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 10:32:36 +0800 Subject: [PATCH 24/26] Update --- python/tvm/relay/op/strategy/x86.py | 51 +++++++++++++++++++---------- python/tvm/topi/nn/dense.py | 3 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 581aa5d6a781..1271826cefa7 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -374,28 +374,12 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def matmul_strategy_cpu(attrs, inputs, out_type, target): """matmul x86 strategy""" strategy = _op.OpStrategy() - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), - naive_schedule, - name="matmul.generic", - plevel=11, - ) - else: - logger.warning( - "Matmul is not optimized for x86. " - "Recommend to use cblas/mkl/mkldnn for better performance." - ) - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.generic", - ) same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" if "cblas" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_cblas), @@ -403,7 +387,12 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_cblas.x86", plevel=13, ) + if length_before == len(strategy.specializations): + logger.warning( + "Currently cblas only support the data type to be float32 or float64. Skip." + ) if "mkl" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkl), @@ -411,7 +400,13 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkl.x86", plevel=14, ) + if length_before == len(strategy.specializations): + logger.warning( + "Currently mkl only support the data type to be float32, float64 or input with " + "uint8 and int8 while output wiht int32. Skip." 
+ ) if "mkldnn" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkldnn), @@ -419,6 +414,28 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkldnn.x86", plevel=15, ) + if length_before == len(strategy.specializations): + logger.warning("Currently mkldnn only support the data type to be float32. Skip.") + + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), + naive_schedule, + name="matmul.generic", + plevel=11, + ) + else: + # If no cblas/mkl/mkldnn strategy choosed + if not len(strategy.specializations): + logger.warning( + "Matmul is not optimized for x86. " + "Recommend to use cblas/mkl/mkldnn for better performance." + ) + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.generic", + ) return strategy diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index d8c69efb70e0..58c458a7d676 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -60,6 +60,7 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ + # TODO(jcf94): Add multi-dim support for tensor_a assert len(tensor_a.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 @@ -73,7 +74,7 @@ def matmul( if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] if transpose_b else ["k", "j"] + auto_scheduler_rewritten_layout, ["j", "k"] ) auto_scheduler.remove_index_check(tensor_b) elif transpose_b: From 63426d8376c24d77039ebe4c77783e6fddec7237 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 10:39:22 +0800 Subject: [PATCH 25/26] Lint fix --- python/tvm/relay/op/strategy/x86.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1271826cefa7..d09d90a50d41 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -379,7 +379,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" if "cblas" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_cblas), @@ -387,12 +387,13 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_cblas.x86", plevel=13, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning( "Currently cblas only support the data type to be float32 or float64. Skip." 
) if "mkl" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkl), @@ -400,13 +401,14 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkl.x86", plevel=14, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning( "Currently mkl only support the data type to be float32, float64 or input with " "uint8 and int8 while output wiht int32. Skip." ) if "mkldnn" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkldnn), @@ -414,7 +416,8 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkldnn.x86", plevel=15, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning("Currently mkldnn only support the data type to be float32. Skip.") if is_auto_scheduler_enabled(): @@ -426,7 +429,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): ) else: # If no cblas/mkl/mkldnn strategy choosed - if not len(strategy.specializations): + if not strategy.specializations: logger.warning( "Matmul is not optimized for x86. " "Recommend to use cblas/mkl/mkldnn for better performance." From 9b4f77ced69267aa8a28f3715d041a7dc2087f38 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 14:13:14 +0800 Subject: [PATCH 26/26] Bug fix for merge main --- python/tvm/relay/op/strategy/cuda.py | 6 +++--- python/tvm/topi/gpu/dense.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index c40a1f025f2c..dd265e4b4d5b 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -715,9 +715,9 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): ) # Temporary use this as a basic schedule strategy.add_implementation( - wrap_compute_matmul(topi.cuda.matmul_default_cuda), - wrap_topi_schedule(topi.cuda.schedule_matmul_default_cuda), - name="matmul_default.cuda", + wrap_compute_matmul(topi.gpu.matmul_default), + wrap_topi_schedule(topi.gpu.schedule_matmul_default), + name="matmul_default.gpu", ) if target.kind.name == "cuda" and "cublas" in target.libs: diff --git a/python/tvm/topi/gpu/dense.py b/python/tvm/topi/gpu/dense.py index 806aa9f5ca44..b9009d3f3393 100644 --- a/python/tvm/topi/gpu/dense.py +++ b/python/tvm/topi/gpu/dense.py @@ -49,6 +49,36 @@ def _callback(op): return s +@autotvm.register_topi_compute("matmul_default.gpu") +def matmul_default( + cfg, + tensor_a, + tensor_b, + bias=None, + out_dtype=None, + transpose_a=False, + transpose_b=False, +): + """Matmul operator on GPU""" + return nn.matmul(tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) + + +@autotvm.register_topi_schedule("matmul_default.gpu") +def schedule_matmul_default(cfg, outs): + """Schedule matmul on GPU""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = 
te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "matmul":
+            # Temporarily use this as a basic schedule for matmul
+            # TODO(jcf94): Add a more general schedule for matmul
+            _schedule_dense_small_batch(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
 def _schedule_dense_small_batch(cfg, s, C):
     A, weights = C.op.input_tensors
     _, in_dim_weights = get_const_tuple(weights.shape)
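
For reference, a minimal end-to-end sketch (not part of any patch above) of how the `nn.matmul` op introduced by this series is meant to be called from Relay. The `transpose_a`/`transpose_b` keywords and the NT fallback to `nn.dense` follow the API added in these patches; the build and runtime calls are ordinary TVM usage, and the shapes are arbitrary illustration values:

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    dtype = "float32"
    # tensor_a is stored transposed as (K, M) and tensor_b as (K, N), so this
    # lowers to the new nn.matmul op rather than the nn.dense fallback used
    # for the (transpose_a=False, transpose_b=True) case.
    a = relay.var("a", shape=(16, 32), dtype=dtype)
    b = relay.var("b", shape=(16, 8), dtype=dtype)
    y = relay.nn.matmul(a, b, transpose_a=True, transpose_b=False)
    mod = tvm.IRModule.from_expr(relay.Function([a, b], y))

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm")

    dev = tvm.cpu()
    rt = graph_executor.GraphModule(lib["default"](dev))
    a_np = np.random.uniform(size=(16, 32)).astype(dtype)
    b_np = np.random.uniform(size=(16, 8)).astype(dtype)
    rt.set_input("a", a_np)
    rt.set_input("b", b_np)
    rt.run()
    # Result has shape (32, 8), i.e. a_np.T @ b_np
    np.testing.assert_allclose(rt.get_output(0).asnumpy(), a_np.T @ b_np, rtol=1e-5)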