From 6d7a450789faa9f5c5fda13311c49e11ee11a6d4 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 14:29:46 +0800 Subject: [PATCH 01/26] Add Matmul Op --- include/tvm/relay/attrs/nn.h | 18 ++- python/tvm/relay/op/nn/_nn.py | 45 ++++++- python/tvm/relay/op/nn/nn.py | 41 ++++++- python/tvm/relay/op/op_attrs.py | 6 +- python/tvm/relay/op/strategy/cuda.py | 20 +++ python/tvm/relay/op/strategy/generic.py | 36 ++++++ python/tvm/topi/cuda/dense.py | 44 ++++++- python/tvm/topi/generic/nn.py | 17 +++ python/tvm/topi/nn/dense.py | 115 ++++++++++++++++-- rust/tvm/src/ir/relay/attrs/nn.rs | 8 +- src/relay/op/make_op.h | 3 + src/relay/op/nn/nn.cc | 61 ++++++++-- src/relay/op/nn/nn.h | 25 ++-- src/relay/qnn/op/dense.cc | 16 +-- src/relay/quantize/realize.cc | 6 +- .../auto_scheduler_layout_rewrite.cc | 8 +- .../transforms/combine_parallel_dense.cc | 16 +-- tests/python/relay/test_op_level1.py | 63 +++++++++- tests/python/topi/python/test_topi_matmul.py | 26 ++++ 19 files changed, 501 insertions(+), 73 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index a58bb8750c14..de8fa26d66d9 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -961,19 +961,29 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for dense operator */ -struct DenseAttrs : public tvm::AttrsNode { +/*! \brief Attributes for matmul operator and dense operator */ +struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; - tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite DataType out_dtype; + bool input_transposed; + bool weight_transposed; + tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite - TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") { + TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") { TVM_ATTR_FIELD(units).describe("Number of hidden units of the dense transformation."); // use 0 bits to indicate none. TVM_ATTR_FIELD(out_dtype) .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); + + TVM_ATTR_FIELD(input_transposed) + .set_default(false) + .describe("Whether the input tensor is in transposed format."); + + TVM_ATTR_FIELD(weight_transposed) + .set_default(false) + .describe("Whether the weight tensor is in transposed format."); } }; diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index c6c4f4bfb959..dd1ee6a43857 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -52,6 +52,32 @@ reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) +@reg.register_legalize("nn.matmul") +def legalize_matmul(attrs, inputs, types): + """Legalize matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current matmul + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + return topi.nn.matmul_legalize(attrs, inputs, types) + + +# matmul +reg.register_strategy("nn.matmul", strategy.matmul_strategy) +reg.register_pattern("nn.matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + @reg.register_legalize("nn.dense") def legalize_dense(attrs, inputs, types): """Legalize dense op. 
@@ -1149,21 +1175,34 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _dense_shape_func(data_shape, weight_shape): +def _matmul_shape_func(data_shape, weight_shape, input_transposed, weight_transposed): out = output_tensor((data_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): out[i] = data_shape[i] - out[out.shape[0] - 1] = weight_shape[0] + if input_transposed: + out[out.shape[0] - 2] = out[out.shape[0] - 1] + out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] return out +@reg.register_shape_func("nn.matmul", False) +def matmul_shape_func(attrs, inputs, _): + """ + Shape function for matmul op. + """ + ret = [ + _matmul_shape_func(inputs[0], inputs[1], attrs.input_transposed, attrs.weight_transposed) + ] + return ret + + @reg.register_shape_func("nn.dense", False) def dense_shape_func(attrs, inputs, _): """ Shape function for dense op. """ - ret = [_dense_shape_func(inputs[0], inputs[1])] + ret = [_matmul_shape_func(inputs[0], inputs[1], False, True)] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index caf1f187fad3..179ff97afe68 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,9 +1471,48 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) +def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weight_transposed=False): + """Dense operator. + Applies a linear transformation. The X & W can be transposed. + + .. math:: + + `Y = X * W` + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator, + of shape `(d_1, d_2, ..., d_n, units_in)`. + + weight : tvm.relay.Expr + The weight expressions, 2-D matrix, + of shape `(units_in, units)` or `(units, units_in)`. + + units : int, optional + Number of hidden units of the dense transformation. + + out_dtype : str, optional + Specifies the output data type for mixed precision dense, + of shape `(d_1, d_2, ..., d_n, units)`. + + weight_transposed : bool, optional + Whether the weight tensor is in transposed format. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # With N/T format, the compute will be seen as a dense + if not input_transposed and weight_transposed: + return dense(data, weight, units, out_dtype) + return _make.matmul(data, weight, units, out_dtype, input_transposed, weight_transposed) + + def dense(data, weight, units=None, out_dtype=""): """Dense operator. - Applies a linear transformation + Applies a linear transformation. This is an alias of matmul with weight transposed. .. 
math:: diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 6844d133a77e..323190e309b8 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -64,9 +64,9 @@ class BiasAddAttrs(Attrs): """Atttribute of nn.bias_add""" -@tvm._ffi.register_object("relay.attrs.DenseAttrs") -class DenseAttrs(Attrs): - """Attributes for nn.dense""" +@tvm._ffi.register_object("relay.attrs.MatmulAttrs") +class MatmulAttrs(Attrs): + """Attributes for nn.matmul and nn.dense""" @tvm._ffi.register_object("relay.attrs.SoftmaxAttrs") diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index b4db412700a7..ef0c355ffe47 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -698,6 +698,26 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): return strategy +@matmul_strategy.register(["cuda", "gpu"]) +def matmul_strategy_cuda(attrs, inputs, out_type, target): + """matmul cuda strategy""" + strategy = _op.OpStrategy() + if target.kind.name == "cuda" and "cublas" in target.libs: + strategy.add_implementation( + wrap_compute_matmul(topi.cuda.matmul_cublas), + wrap_topi_schedule(topi.cuda.schedule_matmul_cublas), + name="matmul_cublas.cuda", + plevel=25, + ) + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + return strategy + + @dense_strategy.register(["cuda", "gpu"]) def dense_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d56820e409aa..9a552d73bf46 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -712,6 +712,36 @@ def dilation2d_strategy(attrs, inputs, out_type, target): return strategy +# matmul +def wrap_compute_matmul(topi_compute, need_auto_scheduler_layout=False): + """wrap matmul topi compute""" + + def _compute_matmul(attrs, inputs, out_type): + """Compute definition of matmul""" + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + args = [ + inputs[0], + inputs[1], + None, + out_dtype, + attrs.input_transposed, + attrs.weight_transposed, + ] + if need_auto_scheduler_layout: + args.append(get_auto_scheduler_rewritten_layout(attrs)) + return [topi_compute(*args)] + + return _compute_matmul + + +@override_native_generic_func("matmul_strategy") +def matmul_strategy(attrs, inputs, out_type, target): + """matmul generic strategy""" + logger.warning("matmul is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.generic.schedule_matmul), + name="matmul.generic", + ) + return strategy + + # dense def wrap_compute_dense(topi_compute, need_auto_scheduler_layout=False): """wrap dense topi compute""" diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 8adc38b84b1b..71b2464736df 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -30,10 +30,16 @@ logger = logging.getLogger("topi") -@autotvm.register_topi_compute("dense_cublas.cuda") -def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator on CUDA with CUBLAS""" - assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim dense" +def _matmul_cublas_common( + cfg, + data, + weight, + bias=None, 
out_dtype=None, + input_transposed=False, + weight_transposed=False, +): + assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: @@ -41,7 +47,7 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): assert out_dtype == data.dtype, "Mixed precision not supported." batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, False, True) + matmul = cublas.matmul(data, weight, input_transposed, weight_transposed) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -51,6 +57,34 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): return matmul +@autotvm.register_topi_compute("matmul_cublas.cuda") +def matmul_cublas( + cfg, + data, + weight, + bias=None, + out_dtype=None, + input_transposed=False, + weight_transposed=False, +): + """Matmul operator on CUDA with CUBLAS""" + return _matmul_cublas_common( + cfg, data, weight, bias, out_dtype, input_transposed, weight_transposed + ) + + +@autotvm.register_topi_schedule("matmul_cublas.cuda") +def schedule_matmul_cublas(_, outs): + """Schedule matmul operator using CUBLAS""" + return generic.schedule_extern(outs) + + +@autotvm.register_topi_compute("dense_cublas.cuda") +def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA with CUBLAS""" + return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True) + + @autotvm.register_topi_schedule("dense_cublas.cuda") def schedule_dense_cublas(_, outs): """Schedule dense operator using CUBLAS""" diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 04d649037fef..1b3214154687 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -580,6 +580,23 @@ def schedule_fast_softmax(outs): return _default_schedule(outs, False) +def schedule_matmul(outs): + """Schedule for matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of matmul + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_dense(outs): """Schedule for dense diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index e8ec476b86a5..daaf3c343dcf 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -21,7 +21,15 @@ from .. import tag -def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): +def matmul( + data, + weight, + bias=None, + out_dtype=None, + input_transposed=False, + weight_transposed=False, + auto_scheduler_rewritten_layout="", +): """The default implementation of dense in topi. 
Parameters @@ -51,37 +59,120 @@ def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layo assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - batch, in_dim = data.shape + if input_transposed: + in_dim, batch = data.shape + else: + batch, in_dim = data.shape if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] + auto_scheduler_rewritten_layout, ["j", "k"] if weight_transposed else ["k", "j"] ) auto_scheduler.remove_index_check(weight) - else: + elif weight_transposed: out_dim, red_dim = weight.shape + else: + red_dim, out_dim = weight.shape assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - matmul = te.compute( + if input_transposed: + if weight_transposed: + compute_lambda = lambda i, j: te.sum( + data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_matmul_TT" + else: + compute_lambda = lambda i, j: te.sum( + data[k, i].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul_TN" + compute_tag = "matmul" + else: + if weight_transposed: + compute_lambda = lambda i, j: te.sum( + data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_dense" + compute_tag = "dense" + else: + compute_lambda = lambda i, j: te.sum( + data[i, k].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + + mat = te.compute( (batch, out_dim), - lambda i, j: te.sum(data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k), - name="T_dense", - tag="dense", + compute_lambda, + name=compute_name, + tag=compute_tag, attrs={"layout_free_placeholders": [weight]}, ) + if bias is not None: - matmul = te.compute( + mat = te.compute( (batch, out_dim), - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), + lambda i, j: mat[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST, ) if auto_scheduler_rewritten_layout: - matmul = auto_scheduler.rewrite_compute_body(matmul, auto_scheduler_rewritten_layout) + mat = auto_scheduler.rewrite_compute_body(mat, auto_scheduler_rewritten_layout) + + return mat + + +@tvm.target.generic_func +def matmul_legalize(attrs, inputs, types): + """Legalizes matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current dense + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types - return matmul + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # not to change by default + # pylint: disable=unused-argument + return None + + +def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): + """The default implementation of dense in topi. + + Parameters + ---------- + data : tvm.te.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.te.Tensor + 2-D with shape [out_dim, in_dim] + + bias : Optional[tvm.te.Tensor] + 1-D with shape [out_dim] + + out_dtype : Optional[str] + The output type. This is used for mixed precision. + + auto_scheduler_rewritten_layout: str = "" + The layout after auto-scheduler's layout rewrite pass. 
+ + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [batch, out_dim] + """ + return matmul(data, weight, bias, out_dtype, False, True, auto_scheduler_rewritten_layout) @tvm.target.generic_func diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index f0137fa3cbcc..84533e9ecaea 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -56,12 +56,14 @@ pub struct BiasAddAttrsNode { #[repr(C)] #[derive(Object, Debug)] -#[ref_name = "DenseAttrs"] -#[type_key = "relay.attrs.DenseAttrs"] -pub struct DenseAttrsNode { +#[ref_name = "MatmulAttrs"] +#[type_key = "relay.attrs.MatmulAttrs"] +pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, + pub input_transposed: bool, + pub weight_transposed: bool, } #[repr(C)] diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index 81de4bc90ad7..b957df987f4c 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -44,6 +44,9 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr MakeConcatenate(Expr data, int axis); +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, + bool weight_transposed); + Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); Expr MakeBatchMatmul(Expr lhs, Expr rhs, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 32c0a21d46c7..70e7cd0f5be8 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -162,16 +162,47 @@ Useful for .set_support_level(3) .add_type_rel("FIFOBuffer", FIFOBufferRel); -// relay.nn.dense -TVM_REGISTER_NODE_TYPE(DenseAttrs); +// ------------------- relay.nn.matmul +TVM_REGISTER_NODE_TYPE(MatmulAttrs); -// Positional relay function to create dense operator used by frontend FFI. -Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, + bool weight_transposed) { + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - static const Op& op = Op::Get("nn.dense"); - return Call(op, {data, weight}, Attrs(attrs), {}); + attrs->input_transposed = input_transposed; + attrs->weight_transposed = weight_transposed; + if (!input_transposed && weight_transposed) { + static const Op& dense_op = Op::Get("nn.dense"); + return Call(dense_op, {data, weight}, Attrs(attrs), {}); + } else { + static const Op& matmul_op = Op::Get("nn.matmul"); + return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + } +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); + +RELAY_REGISTER_OP("nn.matmul") + .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. + +- **data**: `(x1, x2, ..., xn, input_dim)` +- **weight**: `(input_dim, units)` or `(units, input_dim)` +- **out**: `(x1, x2, ..., xn, units)`. + +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "nD Tensor", "Input data.") + .add_argument("weight", "2D Tensor", "Weight matrix.") + .set_support_level(1) + .add_type_rel("Matmul", MatmulRel); +// ------------------- relay.nn.matmul + +// ------------------- relay.nn.dense +// Positional relay function to create dense operator used by frontend FFI. 
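+// Calling MakeMatmul with data non-transposed and weight transposed dispatches back to the nn.dense op above, so the existing dense semantics (Y = X * W^T) are preserved.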
+Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { + return MakeMatmul(data, weight, units, out_dtype, false, true); } TVM_REGISTER_GLOBAL("relay.op.nn._make.dense").set_body_typed(MakeDense); @@ -184,19 +215,22 @@ RELAY_REGISTER_OP("nn.dense") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) - .add_type_rel("Dense", DenseRel); + .add_type_rel("Dense", MatmulRel); +// ------------------- relay.nn.dense -// relay.nn.contrib_dense_pack +// ------------------- relay.nn.contrib_dense_pack // Positional relay function to create dense_pack operator used by frontend FFI. Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -211,12 +245,13 @@ RELAY_REGISTER_OP("nn.contrib_dense_pack") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "3D Tensor", "Packed weight matrix.") .set_support_level(10) - .add_type_rel("DensePack", DensePackRel); + .add_type_rel("DensePack", DensePackRel); +// ------------------- relay.nn.contrib_dense_pack // relay.leaky_relu TVM_REGISTER_NODE_TYPE(LeakyReluAttrs); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 1ac800f357b0..dd394396a309 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -36,8 +37,8 @@ namespace tvm { namespace relay { template -bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, - const TypeReporter& reporter) { +bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { ICHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* weight = types[1].as(); @@ -48,12 +49,19 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(static_cast(data->shape.size()) != 0); - Array dshape = data->shape; + const Array& dshape = data->shape; Array oshape = dshape; + tvm::PrimExpr reduce = dshape[dshape.size() - 1]; + if (param->input_transposed) { + reduce = dshape[dshape.size() - 2]; + oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); + } if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - Array wshape({param->units, dshape[dshape.size() - 1]}); + const Array& wshape = param->weight_transposed + ? Array({param->units, reduce}) + : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use // data dtype as the weight dtype. However if weight dtype is explicitly // present we will use that. @@ -70,7 +78,7 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, oshape.Set((oshape.size() - 1), param->units); } else { if (weight == nullptr) return false; - Array wshape = weight->shape; + const Array& wshape = weight->shape; // When weight's layout has been rewritten, figure it out based on the // total number of elements and input dimensions. 
if (param->auto_scheduler_rewritten_layout.size() != 0) { @@ -83,11 +91,12 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, } else { ICHECK(static_cast(weight->shape.size()) == 2); if (!data->shape.back().as()) { - ICHECK(reporter->AssertEQ(data->shape[data->shape.size() - 1], weight->shape[1])) - << "DenseRel: input dimension doesn't match," + ICHECK((param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || + (!param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + << "MatmulRel: input dimension doesn't match," << " data shape=" << data->shape << ", weight shape=" << weight->shape; } - oshape.Set((oshape.size() - 1), wshape[0]); + oshape.Set((oshape.size() - 1), param->weight_transposed ? wshape[0] : wshape[1]); } } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 6284524bff27..fbaa8a96f562 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -45,8 +45,8 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr || weight == nullptr) return false; - const auto* param = attrs.as(); - ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr."; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "MatmulAttrs cannot be nullptr."; ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype; ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) @@ -70,22 +70,24 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. Array tensor_types = {types[0], types[1], types[6]}; - return DenseRel(tensor_types, 3, attrs, reporter); + return MatmulRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point, Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, Attrs(attrs), {}); } Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel, - const DenseAttrs* attrs) { + const MatmulAttrs* attrs) { return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype); } @@ -161,7 +163,7 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array& new_args, const auto in_shape = get_shape(arg_types[0]); const int reduction_dim_size = get_const_int(in_shape[1]); - const auto* qnn_dense_attrs = attrs.as(); + const auto* qnn_dense_attrs = attrs.as(); auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); @@ -204,7 +206,7 @@ RELAY_REGISTER_OP("qnn.dense") - **weight**: quantized(int8, unit8) `(units, input_dim)` - **out**: quantized(int32) `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(6) .add_argument("data", "quantized nD Tensor", "Input data.") .add_argument("weight", "quantized 2D Tensor", "Weight matrix.") diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 968628fbfe39..0f7a5eafb0f0 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -280,11 +280,13 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec } Expr rdata = Cast(rhs->data, cfg->dtype_weight); - const auto ref_attrs = ref_call->attrs.as(); - auto attrs = make_object(); + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; + attrs->input_transposed = false; + attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index edc4119ce859..ac110544a2b5 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -87,7 +87,7 @@ class FuncMutator : public ExprMutator { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); - } else if (auto pattr = call->attrs.as()) { + } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); @@ -105,7 +105,7 @@ class FuncMutator : public ExprMutator { std::vector target_ops_{"nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.dense", "nn.batch_matmul"}; + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { @@ -166,8 +166,8 @@ TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout") return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; - } else if (attrs->IsInstance()) { - return attrs.as()->auto_scheduler_rewritten_layout; + } else if (attrs->IsInstance()) { + return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; } else { diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 3cd9cca4fec4..41f3a70c62bc 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -70,15 +70,15 @@ class ParallelDenseToBatchCombiner : public ParallelOpBatchCombiner { } CHECK_EQ(num_args, 2); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); return Downcast(MakeBatchMatmul(new_args[0], new_args[1], origin_attrs->out_dtype)); } virtual bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); ICHECK(attrs_a); ICHECK(attrs_b); const auto* weight_a = a->args[1]->type_as(); @@ -103,8 +103,8 @@ class ParallelDenseToDenseCombiner : 
public ParallelOpCombiner { bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); ICHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); @@ -119,11 +119,13 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { IndexExpr new_output_dims; // concat all weights into one std::tie(new_weight, new_output_dims) = TransformWeight(branches); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); - const auto dense_attrs = make_object(); + const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; + dense_attrs->input_transposed = false; + dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 89475ac7df86..11f79b2477bf 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -409,6 +409,66 @@ def test_batch_norm(): ) +@pytest.mark.xfail +def test_matmul_type_check(): + dtype = "float16" + n, c, h, w = 2, 2, 2, 2 + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + # it should fail since it does not match with m(2) + mismatch_w = 3 + w = relay.var("w", relay.TensorType((mismatch_w, 2), dtype)) + y = relay.nn.matmul(x, w) + yy = run_infer_type(y) + + +@tvm.testing.uses_gpu +def test_matmul(): + for dtype in ["float16", "float32"]: + # Matmul accuracy for float16 is poor + if dtype == "float16": + continue + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + w = relay.var("w", relay.TensorType((2, w), dtype)) + y = relay.nn.matmul(x, w, units=2, weight_transposed=True) + assert "units=2" in y.astext() + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) + + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) + wh, ww = te.size_var("wh"), te.size_var("ww") + w = relay.var("w", relay.TensorType((wh, ww), dtype)) + y = relay.nn.matmul(x, w, input_transposed=True) + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) + + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) + w = relay.var("w", relay.IncompleteType()) + y = relay.nn.matmul(x, w, units=2) + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) + + x = relay.var("x", shape=(5, 10), dtype=dtype) + w = relay.var("w", shape=(5, 2), dtype=dtype) + z = relay.nn.matmul(x, w, input_transposed=True) + + # Check result. 
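+ # With input_transposed=True, x of shape (5, 10) is read as (in_dim, batch) and w of shape + # (5, 2) as (in_dim, units), so the reference result below is np.dot(x_data.T, w_data) of shape (10, 2).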
+ func = relay.Function([x, w], z) + x_data = np.random.rand(5, 10).astype(dtype) + w_data = np.random.rand(5, 2).astype(dtype) + ref_res = np.dot(x_data.transpose(), w_data) + + for target, dev in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", device=dev, target=target) + intrp2 = relay.create_executor("debug", device=dev, target=target) + op_res1 = intrp1.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5) + + @pytest.mark.xfail def test_dense_type_check(): dtype = "float16" @@ -426,7 +486,7 @@ def test_dense(): for dtype in ["float16", "float32"]: # Dense accuracy for float16 is poor if dtype == "float16": - return + continue n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.TensorType((2, w), dtype)) @@ -506,6 +566,7 @@ def test_bitserial_dense(): test_log_softmax() test_dropout() test_batch_norm() + test_matmul() test_dense() test_bitserial_dense() test_dense_dtype() diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index e5a21a3ad3b7..c777b8f2bc6e 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -41,6 +41,31 @@ def with_tvm(lam, *args): return out_nd.numpy() +def verify_nn_matmul(sa, sb, transp_a, transp_b): + a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32) + b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) + c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) + c2 = with_tvm( + lambda A, B: topi.nn.matmul(A, B, input_transposed=transp_a, weight_transposed=transp_b), + a, + b, + ) + tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5) + + +def test_nn_matmul(): + verify_nn_matmul((1, 1), (1, 1), False, False) + verify_nn_matmul((1, 1), (1, 1), True, True) + verify_nn_matmul((2, 2), (2, 2), False, False) + verify_nn_matmul((2, 2), (2, 2), True, True) + verify_nn_matmul((2, 3), (3, 5), False, False) + verify_nn_matmul((5, 3), (3, 2), False, False) + verify_nn_matmul((3, 5), (3, 2), True, False) + verify_nn_matmul((3, 5), (2, 3), True, True) + verify_nn_matmul((3, 5), (3, 2), True, False) + verify_nn_matmul((5, 3), (2, 3), False, True) + + def verify_matmul(sa, sb, transp_a, transp_b): a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32) b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) @@ -79,5 +104,6 @@ def test_tensordot(): if __name__ == "__main__": + test_nn_matmul() test_matmul() test_tensordot() From 3c24a86b578b85b4abe4b77e510778833344ad7f Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:00:44 +0800 Subject: [PATCH 02/26] Bug fix --- python/tvm/relay/frontend/tensorflow.py | 14 ++++++++++- python/tvm/relay/frontend/tensorflow_ops.py | 23 +++++++++++++++---- src/relay/op/nn/nn.h | 1 - .../frontend/tensorflow/test_forward.py | 12 ++++++++-- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 0bdec953a540..7ae94aa6c67c 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,6 +44,10 @@ __all__ = ["from_tensorflow"] +# By default, TVM convert `tf.matmul` to `nn.dense` op with data 
tensor non-transposed and weight +# tensor transposed +_USE_DENSE_INSTEAD_OF_MATMUL = True + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1204,7 +1208,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): return func, self._params -def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): +def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op=True): """Load tensorflow graph which is a python tensorflow graph object into relay. The companion parameters will be handled automatically. @@ -1222,6 +1226,11 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None): outputs : List of output tensor names (Optional) if not specified then the last node is assumed as graph output. + use_dense_op : bool (Optional) + True to convert `tf.matmul` to `nn.dense`, otherwise convert to `nn.matmul`. + The `nn.dense` op requires the data tensor to be non-transposed and the weight tensor to be + transposed, so extra `transpose` ops may be inserted into the original graph. + Returns ------- mod : tvm.IRModule params : dict of str to tvm.nd.NDArray Dict of converted parameters stored in tvm.nd.NDArray format """ + global _USE_DENSE_INSTEAD_OF_MATMUL + if use_dense_op != _USE_DENSE_INSTEAD_OF_MATMUL: + _USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op g = GraphProto() mod, params = g.from_tensorflow(graph, layout, shape, outputs) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index c7385565857d..e89e624e123e 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1114,12 +1114,25 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): channels = _infer_channels(inputs[1], not attr["transpose_b"]) - if attr["transpose_a"]: - inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) - if not attr["transpose_b"]: - inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) + from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + if _USE_DENSE_INSTEAD_OF_MATMUL: + if attr["transpose_a"]: + inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) + if not attr["transpose_b"]: + inputs[1] = _op.transpose(inputs[1], axes=(1, 0)) + return AttrCvt( + op_name="dense", + extras={"units": channels}, + ignores=["transpose_a", "transpose_b", "T"], + )(inputs, attr) return AttrCvt( - op_name="dense", extras={"units": channels}, ignores=["transpose_a", "transpose_b", "T"] + op_name="matmul", + extras={ + "units": channels, + "input_transposed": attr["transpose_a"] or False, + "weight_transposed": attr["transpose_b"] or False, + }, + ignores=["transpose_a", "transpose_b", "T"], )(inputs, attr) return _impl diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index dd394396a309..fe59ba047e18 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -27,7 +27,6 @@ #include #include #include -#include #include diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 331553388b48..afd38d1c952d 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -117,6 +117,7 @@ def run_tvm_graph( disabled_pass=None, ignore_in_shape=False, serialize=False, + use_dense_op=True, ): """Generic function to compile on relay and execute on tvm""" input_data = convert_to_list(input_data) @@ -131,7 +132,11 @@ def run_tvm_graph(
i.shape if hasattr(i, "shape") else () for e, i in zip(input_node, input_data) } mod, params = relay.frontend.from_tensorflow( - graph_def, layout=layout, shape=shape_dict, outputs=out_names + graph_def, + layout=layout, + shape=shape_dict, + outputs=out_names, + use_dense_op=use_dense_op, ) dev = tvm.device(target, 0) if mode == "debug": @@ -213,6 +218,7 @@ def compare_tf_with_tvm( add_shapes_to_graph_def=True, targets=None, ignore_in_shape=False, + use_dense_op=True, ): """Generic function to generate and compare tensorflow and TVM output""" @@ -260,6 +266,7 @@ def name_without_num(name): mode=mode, cuda_layout=cuda_layout, ignore_in_shape=ignore_in_shape, + use_dense_op=use_dense_op, ) # since the names from tensorflow and relay runs are not exactly same, # first len(tf_output) will be compared @@ -1795,7 +1802,8 @@ def _test_matmul(i, j, k, dtype, outer=None): A_np = np.random.uniform(high=5.0, size=A_shape).astype(dtype) B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) - compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name) + compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name, use_dense_op=True) + compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name, use_dense_op=False) def test_forward_matmul(): From 4a808d727e7f2ecae56945495a39a2e7fbb8a5de Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:18:13 +0800 Subject: [PATCH 03/26] Lint fix --- src/relay/transforms/auto_scheduler_layout_rewrite.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index ac110544a2b5..5005e7bf4c41 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -103,9 +103,9 @@ class FuncMutator : public ExprMutator { std::deque ori_layouts_queue_; std::deque new_layouts_queue_; - std::vector target_ops_{"nn.conv2d", "nn.conv3d", - "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.matmul", "nn.dense", "nn.batch_matmul"}; + std::vector target_ops_{ + "nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { From 2f3d4fae5e3976513334af209e6b5585926a1c0a Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:23:53 +0800 Subject: [PATCH 04/26] Update --- include/tvm/relay/attrs/nn.h | 4 ++-- python/tvm/relay/frontend/tensorflow_ops.py | 2 +- python/tvm/relay/op/nn/_nn.py | 6 +++--- python/tvm/relay/op/nn/nn.py | 11 +++++++---- python/tvm/relay/op/strategy/generic.py | 2 +- python/tvm/topi/cuda/dense.py | 8 ++++---- python/tvm/topi/nn/dense.py | 6 +++--- rust/tvm/src/ir/relay/attrs/nn.rs | 2 +- src/relay/op/make_op.h | 2 +- src/relay/op/nn/nn.cc | 8 ++++---- src/relay/op/nn/nn.h | 2 +- src/relay/qnn/op/dense.cc | 2 +- src/relay/quantize/realize.cc | 2 +- src/relay/transforms/combine_parallel_dense.cc | 2 +- tests/python/relay/test_op_level1.py | 4 ++-- tests/python/topi/python/test_topi_matmul.py | 2 +- 16 files changed, 34 insertions(+), 31 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index de8fa26d66d9..6518f122b72e 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -965,7 +965,7 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; - bool 
input_transposed; + bool data_transposed; bool weight_transposed; tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite @@ -977,7 +977,7 @@ struct MatmulAttrs : public tvm::AttrsNode { .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(input_transposed) + TVM_ATTR_FIELD(data_transposed) .set_default(false) .describe("Whether the input tensor is in transposed format."); diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index e89e624e123e..32c1792aa700 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1129,7 +1129,7 @@ def _impl(inputs, attr, params, mod): op_name="matmul", extras={ "units": channels, - "input_transposed": attr["transpose_a"] or False, + "data_transposed": attr["transpose_a"] or False, "weight_transposed": attr["transpose_b"] or False, }, ignores=["transpose_a", "transpose_b", "T"], diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index dd1ee6a43857..7bf593c0d9be 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1175,11 +1175,11 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _matmul_shape_func(data_shape, weight_shape, input_transposed, weight_transposed): +def _matmul_shape_func(data_shape, weight_shape, data_transposed, weight_transposed): out = output_tensor((data_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): out[i] = data_shape[i] - if input_transposed: + if data_transposed: out[out.shape[0] - 2] = out[out.shape[0] - 1] out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] @@ -1192,7 +1192,7 @@ def matmul_shape_func(attrs, inputs, _): Shape function for matmul op. """ ret = [ - _matmul_shape_func(inputs[0], inputs[1], attrs.input_transposed, attrs.weight_transposed) + _matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed) ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 179ff97afe68..df049478f577 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,7 +1471,7 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) -def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weight_transposed=False): +def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): """Dense operator. Applies a linear transformation. The X & W can be transposed. @@ -1483,7 +1483,7 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh ---------- data : tvm.relay.Expr The input data to the operator, - of shape `(d_1, d_2, ..., d_n, units_in)`. + of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`. weight : tvm.relay.Expr The weight expressions, 2-D matrix, @@ -1496,6 +1496,9 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. + data_transposed : bool, optional + Whether the input tensor is in transposed format. + weight_transposed : bool, optional Whether the weight tensor is in transposed format. @@ -1505,9 +1508,9 @@ def matmul(data, weight, units=None, out_dtype="", input_transposed=False, weigh The computed result. 
""" # With N/T format, the compute will be seen as a dense - if not input_transposed and weight_transposed: + if not data_transposed and weight_transposed: return dense(data, weight, units, out_dtype) - return _make.matmul(data, weight, units, out_dtype, input_transposed, weight_transposed) + return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) def dense(data, weight, units=None, out_dtype=""): diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 9a552d73bf46..ed3bc4af8d3d 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -725,7 +725,7 @@ def _compute_matmul(attrs, inputs, out_type): inputs[1], None, out_dtype, - attrs.input_transposed, + attrs.data_transposed, attrs.weight_transposed, ] if need_auto_scheduler_layout: diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 71b2464736df..76fbd7f82117 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -36,7 +36,7 @@ def _matmul_cublas_common( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, ): assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" @@ -47,7 +47,7 @@ def _matmul_cublas_common( assert out_dtype == data.dtype, "Mixed precision not supported." batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, input_transposed, weight_transposed) + matmul = cublas.matmul(data, weight, data_transposed, weight_transposed) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -64,12 +64,12 @@ def matmul_cublas( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, ): """Matmul operator on CUDA with CUBLAS""" return _matmul_cublas_common( - cfg, data, weight, bias, out_dtype, input_transposed, weight_transposed + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed ) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index daaf3c343dcf..9cc0b1c19f84 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -26,7 +26,7 @@ def matmul( weight, bias=None, out_dtype=None, - input_transposed=False, + data_transposed=False, weight_transposed=False, auto_scheduler_rewritten_layout="", ): @@ -59,7 +59,7 @@ def matmul( assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - if input_transposed: + if data_transposed: in_dim, batch = data.shape else: batch, in_dim = data.shape @@ -77,7 +77,7 @@ def matmul( assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - if input_transposed: + if data_transposed: if weight_transposed: compute_lambda = lambda i, j: te.sum( data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index 84533e9ecaea..e77972e45f86 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -62,7 +62,7 @@ pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, - pub input_transposed: bool, + pub data_transposed: bool, pub weight_transposed: bool, } diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index b957df987f4c..f7bb98f9d1b6 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ 
-44,7 +44,7 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr MakeConcatenate(Expr data, int axis); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed); Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 70e7cd0f5be8..7283242e1914 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -165,14 +165,14 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool input_transposed, +Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->input_transposed = input_transposed; + attrs->data_transposed = data_transposed; attrs->weight_transposed = weight_transposed; - if (!input_transposed && weight_transposed) { + if (!data_transposed && weight_transposed) { static const Op& dense_op = Op::Get("nn.dense"); return Call(dense_op, {data, weight}, Attrs(attrs), {}); } else { @@ -229,7 +229,7 @@ Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index fe59ba047e18..60bff5d3c69a 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -51,7 +51,7 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const Array& dshape = data->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (param->input_transposed) { + if (param->data_transposed) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index fbaa8a96f562..f47fe71ebab7 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -79,7 +79,7 @@ Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kern auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 0f7a5eafb0f0..09572b04ed4b 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -285,7 +285,7 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - attrs->input_transposed = false; + attrs->data_transposed = false; attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 41f3a70c62bc..966210c3c882 100644 --- 
a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -124,7 +124,7 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; - dense_attrs->input_transposed = false; + dense_attrs->data_transposed = false; dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 11f79b2477bf..d5a0e8bc8403 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -439,7 +439,7 @@ def test_matmul(): x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) wh, ww = te.size_var("wh"), te.size_var("ww") w = relay.var("w", relay.TensorType((wh, ww), dtype)) - y = relay.nn.matmul(x, w, input_transposed=True) + y = relay.nn.matmul(x, w, data_transposed=True) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) @@ -452,7 +452,7 @@ def test_matmul(): x = relay.var("x", shape=(5, 10), dtype=dtype) w = relay.var("w", shape=(5, 2), dtype=dtype) - z = relay.nn.matmul(x, w, input_transposed=True) + z = relay.nn.matmul(x, w, data_transposed=True) # Check result. func = relay.Function([x, w], z) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index c777b8f2bc6e..cf92ab0807cc 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -46,7 +46,7 @@ def verify_nn_matmul(sa, sb, transp_a, transp_b): b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) c2 = with_tvm( - lambda A, B: topi.nn.matmul(A, B, input_transposed=transp_a, weight_transposed=transp_b), + lambda A, B: topi.nn.matmul(A, B, data_transposed=transp_a, weight_transposed=transp_b), a, b, ) From 0fc436a9109032fec2f5a69f1b3b47c77b46a367 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:35:56 +0800 Subject: [PATCH 05/26] Update --- python/tvm/relay/op/nn/nn.py | 2 +- python/tvm/topi/nn/dense.py | 8 +++++++- src/relay/op/nn/nn.cc | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index df049478f577..ee82d47548ec 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1497,7 +1497,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight of shape `(d_1, d_2, ..., d_n, units)`. data_transposed : bool, optional - Whether the input tensor is in transposed format. + Whether the data tensor is in transposed format. weight_transposed : bool, optional Whether the weight tensor is in transposed format. diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 9cc0b1c19f84..b7e3ece5f579 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -30,7 +30,7 @@ def matmul( weight_transposed=False, auto_scheduler_rewritten_layout="", ): - """The default implementation of dense in topi. + """The default implementation of matmul in topi. Parameters ---------- @@ -46,6 +46,12 @@ def matmul( out_dtype : Optional[str] The output type. This is used for mixed precision. + data_transposed : Optional[bool] + Whether the data tensor is in transposed format. 
+ + weight_transposed : Optional[bool] + Whether the weight tensor is in transposed format. + auto_scheduler_rewritten_layout: str = "" The layout after auto-scheduler's layout rewrite pass. diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7283242e1914..20179a4597ed 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -186,7 +186,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); RELAY_REGISTER_OP("nn.matmul") .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. -- **data**: `(x1, x2, ..., xn, input_dim)` +- **data**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` - **weight**: `(input_dim, units)` or `(units, input_dim)` - **out**: `(x1, x2, ..., xn, units)`. From e262e5ebcf94b6c53c15bb7af3abaa14c89bfacf Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:42:39 +0800 Subject: [PATCH 06/26] Lint fix --- src/relay/transforms/auto_scheduler_layout_rewrite.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index 5005e7bf4c41..8aa5117dee1e 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -105,7 +105,7 @@ class FuncMutator : public ExprMutator { std::vector target_ops_{ "nn.conv2d", "nn.conv3d", "nn.contrib_conv2d_winograd_without_weight_transform", - "nn.matmul", "nn.dense", "nn.batch_matmul"}; + "nn.matmul", "nn.dense", "nn.batch_matmul"}; }; Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) { From 3bdff51fb9dda3fe81dd1f6ad5f3b4026c4d6923 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:46:16 +0800 Subject: [PATCH 07/26] Update --- python/tvm/relay/op/nn/nn.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ee82d47548ec..d8d36d3a544d 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1507,15 +1507,12 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. """ - # With N/T format, the compute will be seen as a dense - if not data_transposed and weight_transposed: - return dense(data, weight, units, out_dtype) return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) def dense(data, weight, units=None, out_dtype=""): """Dense operator. - Applies a linear transformation. This is an alias of matmul with weight transposed. + Applies a linear transformation .. 
math:: From 23facfb7884e5586a3d76e0f9591aef16022e6b0 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 15:51:19 +0800 Subject: [PATCH 08/26] Lint fix --- python/tvm/relay/frontend/tensorflow_ops.py | 3 ++- python/tvm/relay/op/nn/_nn.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 32c1792aa700..c9b37881e208 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1113,8 +1113,9 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): - channels = _infer_channels(inputs[1], not attr["transpose_b"]) from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + + channels = _infer_channels(inputs[1], not attr["transpose_b"]) if _USE_DENSE_INSTEAD_OF_MATMUL: if attr["transpose_a"]: inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 7bf593c0d9be..929331c3daf2 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1191,9 +1191,7 @@ def matmul_shape_func(attrs, inputs, _): """ Shape function for matmul op. """ - ret = [ - _matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed) - ] + ret = [_matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed)] return ret From e73a5a6b4c6dcc2d69d65b8a7776be778f20a5aa Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 17:42:06 +0800 Subject: [PATCH 09/26] Bug fix --- python/tvm/relay/op/nn/nn.py | 12 +++++++++++- .../contrib/test_arm_compute_lib/test_dense.py | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index d8d36d3a544d..5a90d3ba6aab 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1510,7 +1510,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) -def dense(data, weight, units=None, out_dtype=""): +def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=True): """Dense operator. Applies a linear transformation @@ -1535,11 +1535,21 @@ def dense(data, weight, units=None, out_dtype=""): Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. + data_transposed : bool, optional + Whether the data tensor is in transposed format. Expected to be False. + + weight_transposed : bool, optional + Whether the weight tensor is in transposed format. Expected to be True. + Returns ------- result : tvm.relay.Expr The computed result. 
""" + # Add data_transposed & weight_transposed parameters for some API requires to apply + # attrs to this function + assert data_transposed + assert not weight_transposed return _make.dense(data, weight, units, out_dtype) diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index e6620a4bc1cb..0fac4de35388 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,6 +123,8 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], + "data_transposed": [[0]], + "weight_transposed": [[1]], }, } From f5aedd84d770a6bddc67c190bc4c976e4b3d8185 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 18:48:17 +0800 Subject: [PATCH 10/26] Bug fix --- python/tvm/relay/op/nn/nn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 5a90d3ba6aab..a6cf147cc564 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1548,8 +1548,8 @@ def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_ """ # Add data_transposed & weight_transposed parameters for some API requires to apply # attrs to this function - assert data_transposed - assert not weight_transposed + assert not data_transposed + assert weight_transposed return _make.dense(data, weight, units, out_dtype) From 9dc686ff5737debd63a9d825fe14f015266c7313 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 10 Jun 2021 19:59:59 +0800 Subject: [PATCH 11/26] Bug fix --- tests/python/contrib/test_arm_compute_lib/test_dense.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 0fac4de35388..007a6b9bf8d0 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,8 +123,8 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], - "data_transposed": [[0]], - "weight_transposed": [[1]], + "data_transposed": [["0"]], + "weight_transposed": [["1"]], }, } From d5e2625fd47e65cad78d4307c061a7ad0358c7ad Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 17 Jun 2021 11:15:55 +0800 Subject: [PATCH 12/26] Recover DenseAttrs --- include/tvm/relay/attrs/nn.h | 16 ++++++++++++ python/tvm/relay/op/nn/nn.py | 12 +-------- python/tvm/relay/op/op_attrs.py | 7 ++++- rust/tvm/src/ir/relay/attrs/nn.rs | 10 +++++++ src/relay/op/nn/nn.cc | 26 ++++++++++--------- src/relay/op/nn/nn.h | 19 +++++++++----- src/relay/qnn/op/dense.cc | 16 +++++------- src/relay/quantize/realize.cc | 6 ++--- .../auto_scheduler_layout_rewrite.cc | 4 +++ .../transforms/combine_parallel_dense.cc | 16 +++++------- .../test_arm_compute_lib/test_dense.py | 2 -- 11 files changed, 79 insertions(+), 55 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 6518f122b72e..0a3246ecf5a1 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -987,6 +987,22 @@ struct MatmulAttrs : public tvm::AttrsNode { } }; +/*! 
\brief Attributes for dense operator */ +struct DenseAttrs : public tvm::AttrsNode { + IndexExpr units; + tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite + DataType out_dtype; + + TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") { + TVM_ATTR_FIELD(units).describe("Number of hidden units of the dense transformation."); + + // use 0 bits to indicate none. + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + } +}; + /*! \brief Attributes for batch matmul operator */ struct BatchMatmulAttrs : public tvm::AttrsNode { tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index a6cf147cc564..d8d36d3a544d 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1510,7 +1510,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) -def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=True): +def dense(data, weight, units=None, out_dtype=""): """Dense operator. Applies a linear transformation @@ -1535,21 +1535,11 @@ def dense(data, weight, units=None, out_dtype="", data_transposed=False, weight_ Specifies the output data type for mixed precision dense, of shape `(d_1, d_2, ..., d_n, units)`. - data_transposed : bool, optional - Whether the data tensor is in transposed format. Expected to be False. - - weight_transposed : bool, optional - Whether the weight tensor is in transposed format. Expected to be True. - Returns ------- result : tvm.relay.Expr The computed result. 
""" - # Add data_transposed & weight_transposed parameters for some API requires to apply - # attrs to this function - assert not data_transposed - assert weight_transposed return _make.dense(data, weight, units, out_dtype) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 323190e309b8..68c745beb027 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -66,7 +66,12 @@ class BiasAddAttrs(Attrs): @tvm._ffi.register_object("relay.attrs.MatmulAttrs") class MatmulAttrs(Attrs): - """Attributes for nn.matmul and nn.dense""" + """Attributes for nn.matmul""" + + +@tvm._ffi.register_object("relay.attrs.DenseAttrs") +class DenseAttrs(Attrs): + """Attributes for nn.dense""" @tvm._ffi.register_object("relay.attrs.SoftmaxAttrs") diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index e77972e45f86..e0a1d5bf02cd 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -66,6 +66,16 @@ pub struct MatmulAttrsNode { pub weight_transposed: bool, } +#[repr(C)] +#[derive(Object, Debug)] +#[ref_name = "DenseAttrs"] +#[type_key = "relay.attrs.DenseAttrs"] +pub struct DenseAttrsNode { + pub base: BaseAttrsNode, + pub units: IndexExpr, + pub out_dtype: DataType, +} + #[repr(C)] #[derive(Object, Debug)] #[ref_name = "GlobalPool2DAttrs"] diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 20179a4597ed..1cd825618473 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -164,18 +164,22 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); +TVM_REGISTER_NODE_TYPE(DenseAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; if (!data_transposed && weight_transposed) { + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; static const Op& dense_op = Op::Get("nn.dense"); return Call(dense_op, {data, weight}, Attrs(attrs), {}); } else { + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + attrs->data_transposed = data_transposed; + attrs->weight_transposed = weight_transposed; static const Op& matmul_op = Op::Get("nn.matmul"); return Call(matmul_op, {data, weight}, Attrs(attrs), {}); } @@ -215,22 +219,20 @@ RELAY_REGISTER_OP("nn.dense") - **out**: `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) - .add_type_rel("Dense", MatmulRel); + .add_type_rel("Dense", MatmulRel); // ------------------- relay.nn.dense // ------------------- relay.nn.contrib_dense_pack // Positional relay function to create dense_pack operator used by frontend FFI. Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; static const Op& op = Op::Get("nn.contrib_dense_pack"); return Call(op, {data, weight}, Attrs(attrs), {}); } @@ -245,12 +247,12 @@ RELAY_REGISTER_OP("nn.contrib_dense_pack") - **out**: `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(2) .add_argument("data", "nD Tensor", "Input data.") .add_argument("weight", "3D Tensor", "Packed weight matrix.") .set_support_level(10) - .add_type_rel("DensePack", DensePackRel); + .add_type_rel("DensePack", DensePackRel); // ------------------- relay.nn.contrib_dense_pack // relay.leaky_relu diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 60bff5d3c69a..97591981c2a3 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -42,23 +42,28 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr) return false; + ICHECK(static_cast(data->shape.size()) != 0); const AttrType* param = attrs.as(); ICHECK(param != nullptr); - - ICHECK(static_cast(data->shape.size()) != 0); + bool data_transposed = false; + bool weight_transposed = true; + if (attrs->IsInstance()) { + data_transposed = param->data_transposed; + weight_transposed = param->weight_transposed; + } const Array& dshape = data->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (param->data_transposed) { + if (data_transposed) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - const Array& wshape = param->weight_transposed + const Array& wshape = weight_transposed ? Array({param->units, reduce}) : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use @@ -90,12 +95,12 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, } else { ICHECK(static_cast(weight->shape.size()) == 2); if (!data->shape.back().as()) { - ICHECK((param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || - (!param->weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + ICHECK((weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || + (!weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) << "MatmulRel: input dimension doesn't match," << " data shape=" << data->shape << ", weight shape=" << weight->shape; } - oshape.Set((oshape.size() - 1), param->weight_transposed ? wshape[0] : wshape[1]); + oshape.Set((oshape.size() - 1), weight_transposed ? wshape[0] : wshape[1]); } } diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index f47fe71ebab7..6284524bff27 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -45,8 +45,8 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* data = types[0].as(); const auto* weight = types[1].as(); if (data == nullptr || weight == nullptr) return false; - const auto* param = attrs.as(); - ICHECK(param != nullptr) << "MatmulAttrs cannot be nullptr."; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "DenseAttrs cannot be nullptr."; ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8)) << "Expected quantized dense type(int8, uint8) for input but was " << data->dtype; ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) @@ -70,24 +70,22 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. 
Array tensor_types = {types[0], types[1], types[6]}; - return MatmulRel(tensor_types, 3, attrs, reporter); + return DenseRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. Expr MakeQuantizedDense(Expr data, Expr weight, Expr input_zero_point, Expr kernel_zero_point, Expr input_scale, Expr kernel_scale, IndexExpr units, DataType out_dtype) { - auto attrs = make_object(); + auto attrs = make_object(); attrs->units = std::move(units); attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; static const Op& op = Op::Get("qnn.dense"); return Call(op, {data, weight, input_zero_point, kernel_zero_point, input_scale, kernel_scale}, Attrs(attrs), {}); } Expr DenseFirstTerm(const Expr& quantized_data, const Expr& quantized_kernel, - const MatmulAttrs* attrs) { + const DenseAttrs* attrs) { return Dense(quantized_data, quantized_kernel, attrs->units, attrs->out_dtype); } @@ -163,7 +161,7 @@ Expr QnnDenseCanonicalize(const Attrs& attrs, const Array& new_args, const auto in_shape = get_shape(arg_types[0]); const int reduction_dim_size = get_const_int(in_shape[1]); - const auto* qnn_dense_attrs = attrs.as(); + const auto* qnn_dense_attrs = attrs.as(); auto term1 = DenseFirstTerm(quantized_data, quantized_kernel, qnn_dense_attrs); auto term2 = DenseSecondTerm(quantized_data, kernel_zero_point); @@ -206,7 +204,7 @@ RELAY_REGISTER_OP("qnn.dense") - **weight**: quantized(int8, unit8) `(units, input_dim)` - **out**: quantized(int32) `(x1, x2, ..., xn, units)`. )code" TVM_ADD_FILELINE) - .set_attrs_type() + .set_attrs_type() .set_num_inputs(6) .add_argument("data", "quantized nD Tensor", "Input data.") .add_argument("weight", "quantized 2D Tensor", "Weight matrix.") diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 09572b04ed4b..968628fbfe39 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -280,13 +280,11 @@ Expr DenseRealize(const Call& ref_call, const Array& new_args, const Objec } Expr rdata = Cast(rhs->data, cfg->dtype_weight); - const auto ref_attrs = ref_call->attrs.as(); - auto attrs = make_object(); + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); *attrs = *ref_attrs; DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - attrs->data_transposed = false; - attrs->weight_transposed = true; Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc index 8aa5117dee1e..da0bd35a332a 100644 --- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc +++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc @@ -89,6 +89,8 @@ class FuncMutator : public ExprMutator { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); + } else if (auto pattr = call->attrs.as()) { + updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else if (auto pattr = call->attrs.as()) { updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout); } else { @@ -168,6 +170,8 @@ TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout") return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; + } else if 
(attrs->IsInstance()) { + return attrs.as()->auto_scheduler_rewritten_layout; } else if (attrs->IsInstance()) { return attrs.as()->auto_scheduler_rewritten_layout; } else { diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 966210c3c882..3cd9cca4fec4 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -70,15 +70,15 @@ class ParallelDenseToBatchCombiner : public ParallelOpBatchCombiner { } CHECK_EQ(num_args, 2); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); return Downcast(MakeBatchMatmul(new_args[0], new_args[1], origin_attrs->out_dtype)); } virtual bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); ICHECK(attrs_a); ICHECK(attrs_b); const auto* weight_a = a->args[1]->type_as(); @@ -103,8 +103,8 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { bool CanOpsBeCombined(const CallNode* a, const CallNode* b) { StructuralEqual eq; - const auto* attrs_a = a->attrs.as(); - const auto* attrs_b = b->attrs.as(); + const auto* attrs_a = a->attrs.as(); + const auto* attrs_b = b->attrs.as(); const auto* weight_a = a->args[1]->type_as(); const auto* weight_b = b->args[1]->type_as(); ICHECK(attrs_a != nullptr && attrs_b != nullptr && weight_a != nullptr && weight_b != nullptr); @@ -119,13 +119,11 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { IndexExpr new_output_dims; // concat all weights into one std::tie(new_weight, new_output_dims) = TransformWeight(branches); - const auto* origin_attrs = branches[0][0]->attrs.as(); + const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); - const auto dense_attrs = make_object(); + const auto dense_attrs = make_object(); dense_attrs->units = new_output_dims; dense_attrs->out_dtype = origin_attrs->out_dtype; - dense_attrs->data_transposed = false; - dense_attrs->weight_transposed = true; return Call(dense_op, {input, new_weight}, Attrs{dense_attrs}, {}); } diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 007a6b9bf8d0..e6620a4bc1cb 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -123,8 +123,6 @@ def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False): "shape": [[list(output_shape)]], "dtype": [[dtype]], "units": [[str(units)]], - "data_transposed": [["0"]], - "weight_transposed": [["1"]], }, } From 91ead08fed4711b6add620067deafc0ca8b7471a Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 24 Jun 2021 11:39:52 +0800 Subject: [PATCH 13/26] Add grad for matmul & some update --- include/tvm/relay/attrs/nn.h | 2 +- python/tvm/relay/op/_tensor_grad.py | 29 +++++++++++++++++++++++ python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/nn/nn.py | 2 ++ python/tvm/relay/op/strategy/cuda.py | 19 ++++++++++----- python/tvm/relay/op/strategy/x86.py | 22 +++++++++++++++++ python/tvm/topi/cuda/dense.py | 2 +- src/relay/op/nn/nn.cc | 17 ++++++------- src/relay/op/nn/nn.h | 13 +++++----- src/relay/qnn/op/dense.cc | 2 +- tests/python/relay/test_op_grad_level2.py | 17 +++++++++++++ 11 files changed, 101 insertions(+), 26 deletions(-) 
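Note: the gradient that the following diffs register for nn.matmul follows the standard rules for C = op(A) * op(B): with upstream gradient G, the gradients of A and B are themselves matmuls of G with the other operand, with the transpose flags arranged per case. A minimal standalone NumPy sketch (not part of the patch; helper names and shapes are illustrative) that checks all four transpose-flag combinations against numerical gradients:

import numpy as np

def analytic_grads(a, b, grad, ta, tb):
    # Mirrors the four transpose cases handled by the matmul gradient below.
    if ta and tb:
        return b.T @ grad.T, grad.T @ a.T
    if ta and not tb:
        return b @ grad.T, a @ grad
    if not ta and tb:
        return grad @ b, grad.T @ a  # the nn.dense case
    return grad @ b.T, a.T @ grad

def numeric_grad(f, x, eps=1e-5):
    # Central finite differences of a scalar function f at x.
    g = np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        d = np.zeros_like(x)
        d[idx] = eps
        g[idx] = (f(x + d) - f(x - d)) / (2 * eps)
    return g

rng = np.random.default_rng(0)
op = lambda x, t: x.T if t else x
for ta, tb in [(False, False), (True, False), (False, True), (True, True)]:
    a = rng.standard_normal((4, 3) if ta else (3, 4))
    b = rng.standard_normal((5, 4) if tb else (4, 5))
    grad = np.ones_like(op(a, ta) @ op(b, tb))  # gradient of sum(C)
    da, db = analytic_grads(a, b, grad, ta, tb)
    da_num = numeric_grad(lambda x: (op(x, ta) @ op(b, tb)).sum(), a)
    db_num = numeric_grad(lambda x: (op(a, ta) @ op(x, tb)).sum(), b)
    np.testing.assert_allclose(da, da_num, rtol=1e-4, atol=1e-6)
    np.testing.assert_allclose(db, db_num, rtol=1e-4, atol=1e-6)
print("matmul gradient rules hold for all four transpose combinations")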
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 0a3246ecf5a1..40d8406b2be2 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -961,7 +961,7 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for matmul operator and dense operator */ +/*! \brief Attributes for matmul operator */ struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index d5b891088933..1f68cf5bb641 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -557,6 +557,35 @@ def dense_grad(orig, grad): ] +@register_gradient("nn.matmul") +def matmul_grad(orig, grad): + """Returns [grad' @ weight, data @ grad']""" + data, weight = orig.args + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, True): + return [ + collapse_sum_like( + _nn.matmul(weight, grad, data_transposed=True, weight_transposed=True), data + ), + collapse_sum_like( + _nn.matmul(grad, data, data_transposed=True, weight_transposed=True), weight + ), + ] + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, False): + return [ + collapse_sum_like(_nn.matmul(weight, grad, weight_transposed=True), data), + collapse_sum_like(_nn.matmul(data, grad), weight), + ] + if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, True): + # Keep using Dense op here for not involving extra ops + # TODO(jcf94): Merge all to nn.matmul when it is finally ready + return dense_grad(orig, grad) + # (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, False) + return [ + collapse_sum_like(_nn.matmul(grad, weight, weight_transposed=True), data), + collapse_sum_like(_nn.matmul(data, grad, data_transposed=True), weight), + ] + + @register_gradient("nn.batch_matmul") def batch_matmul_grad(orig, grad): """gradient for nn.batch_matmul: in einsum LHS_bik,RHS_bjk->RES_bij diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 929331c3daf2..91fff7df18e4 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -53,7 +53,7 @@ @reg.register_legalize("nn.matmul") -def leaglize_matmul(attrs, inputs, types): +def legalize_matmul(attrs, inputs, types): """Legalize matmul op. Parameters diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index d8d36d3a544d..ff7fe9080714 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1507,6 +1507,8 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. 
""" + if not data_transposed and weight_transposed: + return dense(data, weight, units, out_dtype) return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 2f49aa4a89c7..d0c724832b57 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,6 +702,19 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + else: + logger.warning("Matmul other than NT format is not optimized for cuda.") + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda", + ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_cublas), @@ -709,12 +722,6 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): name="matmul_cublas.cuda", plevel=25, ) - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.cuda", - ) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index c21ec4d13906..7173e5b8db63 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -370,6 +370,28 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): return strategy +@matmul_strategy.register("cpu") +def matmul_strategy_cpu(attrs, inputs, out_type, target): + """matmul x86 strategy""" + strategy = _op.OpStrategy() + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), + naive_schedule, + name="matmul.generic", + plevel=11, + ) + else: + logger.warning("Matmul other than NT format is not optimized for x86.") + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.generic", + ) + + return strategy + + @dense_strategy.register("cpu") def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 76fbd7f82117..884a1c61a5e5 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -104,7 +104,7 @@ def schedule_dense_small_batch(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == "dense": + if op.tag == "dense" or op.tag == "matmul": _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 1cd825618473..0fee3b15545d 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -164,17 +164,9 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -TVM_REGISTER_NODE_TYPE(DenseAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - if (!data_transposed && weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - static const Op& dense_op = Op::Get("nn.dense"); - return Call(dense_op, {data, weight}, Attrs(attrs), 
{}); - } else { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; @@ -182,7 +174,6 @@ Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, boo attrs->weight_transposed = weight_transposed; static const Op& matmul_op = Op::Get("nn.matmul"); return Call(matmul_op, {data, weight}, Attrs(attrs), {}); - } } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); @@ -204,9 +195,15 @@ RELAY_REGISTER_OP("nn.matmul") // ------------------- relay.nn.matmul // ------------------- relay.nn.dense +TVM_REGISTER_NODE_TYPE(DenseAttrs); + // Positional relay function to create dense operator used by frontend FFI. Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { - return MakeMatmul(data, weight, units, out_dtype, false, true); + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + static const Op& op = Op::Get("nn.dense"); + return Call(op, {data, weight}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.dense").set_body_typed(MakeDense); diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 97591981c2a3..a1fbc0a4de90 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -46,11 +46,13 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const AttrType* param = attrs.as(); ICHECK(param != nullptr); + // Default set to dense layout bool data_transposed = false; bool weight_transposed = true; - if (attrs->IsInstance()) { - data_transposed = param->data_transposed; - weight_transposed = param->weight_transposed; + const auto& mattrs = attrs.as(); + if (mattrs != nullptr) { + data_transposed = mattrs->data_transposed; + weight_transposed = mattrs->weight_transposed; } const Array& dshape = data->shape; @@ -63,9 +65,8 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, if (param->units.defined()) { // validate the weight shape is proper if defined // Assign weight type - const Array& wshape = weight_transposed - ? Array({param->units, reduce}) - : Array({reduce, param->units}); + const Array& wshape = weight_transposed ? Array({param->units, reduce}) + : Array({reduce, param->units}); // It is possible for weight to be nullptr in which case we will use // data dtype as the weight dtype. However if weight dtype is explicitly // present we will use that. diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index 6284524bff27..592fa77aed77 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -70,7 +70,7 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay // Dense infer type function. Array tensor_types = {types[0], types[1], types[6]}; - return DenseRel(tensor_types, 3, attrs, reporter); + return MatmulRel(tensor_types, 3, attrs, reporter); } // Positional relay function to create quantized dense operator used by frontend FFI. 
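Note: alongside the gradient unit tests below, a minimal usage sketch of the nn.matmul call path as it stands at this point in the series (not part of the patch; shapes are illustrative). It builds a matmul whose first input is stored transposed and runs Relay type inference, which should report the (M, N) output shape:

import tvm
from tvm import relay

# data stored as (K, M) = (5, 10), weight stored as (K, N) = (5, 2)
x = relay.var("x", shape=(5, 10), dtype="float32")
w = relay.var("w", shape=(5, 2), dtype="float32")
y = relay.nn.matmul(x, w, data_transposed=True)  # Y = X^T * W

mod = tvm.IRModule.from_expr(relay.Function([x, w], y))
mod = relay.transform.InferType()(mod)
print(mod["main"].ret_type)  # expected: Tensor[(10, 2), float32]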
diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 686fd9834640..b9dff0464f07 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -199,6 +199,22 @@ def test_dense_grad(): verify_dense_grad((5, 4), (3, 4)) +def verify_matmul_grad(d_shape, w_shape, d_transposed, w_transposed): + data = relay.var("data", relay.TensorType(d_shape, "float32")) + weight = relay.var("weight", relay.TensorType(w_shape, "float32")) + fwd_func = relay.Function( + [data, weight], + relay.nn.matmul(data, weight, data_transposed=d_transposed, weight_transposed=w_transposed), + ) + check_grad(fwd_func) + + +def test_matmul_grad(): + verify_matmul_grad((1, 8), (8, 16), False, False) + verify_matmul_grad((4, 1), (4, 3), True, False) + verify_matmul_grad((4, 5), (3, 4), True, True) + + def verify_batch_flatten_grad(d_shape): data = relay.var("data", relay.TensorType(d_shape, "float32")) fwd_func = relay.Function([data], relay.nn.batch_flatten(data)) @@ -216,4 +232,5 @@ def test_batch_flatten_grad(): test_global_avg_pool2d_grad() test_conv2d_grad() test_dense_grad() + test_matmul_grad() test_batch_flatten_grad() From eec6f25786432467343a42e2376cc3f987269244 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Thu, 24 Jun 2021 14:07:53 +0800 Subject: [PATCH 14/26] Bug fix --- python/tvm/relay/op/nn/_nn.py | 20 +++++++++++++++++--- python/tvm/relay/op/nn/nn.py | 4 ++-- python/tvm/relay/op/strategy/cuda.py | 21 +++++++++------------ src/relay/op/nn/nn.cc | 14 +++++++------- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 91fff7df18e4..7812c42a62c2 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -59,7 +59,7 @@ def legalize_matmul(attrs, inputs, types): Parameters ---------- attrs : tvm.ir.Attrs - Attributes of current convolution + Attributes of current matmul inputs : list of tvm.relay.Expr The args of the Relay expr to be legalized types : list of types @@ -1191,7 +1191,14 @@ def matmul_shape_func(attrs, inputs, _): """ Shape function for matmul op. """ - ret = [_matmul_shape_func(inputs[0], inputs[1], attrs.data_transposed, attrs.weight_transposed)] + ret = [ + _matmul_shape_func( + inputs[0], + inputs[1], + expr.IntImm("bool", attrs.data_transposed), + expr.IntImm("bool", attrs.weight_transposed), + ) + ] return ret @@ -1200,7 +1207,14 @@ def dense_shape_func(attrs, inputs, _): """ Shape function for dense op. """ - ret = [_matmul_shape_func(inputs[0], inputs[1], False, True)] + ret = [ + _matmul_shape_func( + inputs[0], + inputs[1], + expr.IntImm("bool", False), + expr.IntImm("bool", True), + ) + ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ff7fe9080714..c1aa1cfcc788 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1472,12 +1472,12 @@ def bias_add(data, bias, axis=1): def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): - """Dense operator. + """Matmul operator. Applies a linear transformation. The X & W can be transposed. .. 
math:: - `Y = X * W` + `Y = X * W` Parameters ---------- diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index d0c724832b57..890b02e61867 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,19 +702,16 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.cuda", - ) - else: + # Temporary use this as a basic schedule + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda", + ) + + if not is_auto_scheduler_enabled(): logger.warning("Matmul other than NT format is not optimized for cuda.") - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), - name="dense_small_batch.cuda", - ) + if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_cublas), diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 0fee3b15545d..24ba6ac1843e 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -167,13 +167,13 @@ TVM_REGISTER_NODE_TYPE(MatmulAttrs); Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, bool weight_transposed) { - auto attrs = make_object(); - attrs->units = units; - attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; - static const Op& matmul_op = Op::Get("nn.matmul"); - return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + attrs->data_transposed = data_transposed; + attrs->weight_transposed = weight_transposed; + static const Op& matmul_op = Op::Get("nn.matmul"); + return Call(matmul_op, {data, weight}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); From 85fb6ca94d26ed56d652e1cb661ffe25ba7ac9af Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:41:28 +0800 Subject: [PATCH 15/26] Update matmul cuda default schedule --- python/tvm/relay/op/strategy/cuda.py | 20 ++++++++++++------- python/tvm/topi/cuda/dense.py | 29 +++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 890b02e61867..3ec8546407c7 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -702,15 +702,21 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): def matmul_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() - # Temporary use this as a basic schedule - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), - name="dense_small_batch.cuda", - ) - if not is_auto_scheduler_enabled(): + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.cuda", + ) + else: logger.warning("Matmul other than NT format is not optimized for cuda.") + # Temporary use this as a 
basic schedule + strategy.add_implementation( + wrap_compute_matmul(topi.cuda.matmul_default_cuda), + wrap_topi_schedule(topi.cuda.schedule_matmul_default_cuda), + name="matmul_default.cuda", + ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 884a1c61a5e5..2db6aa7a1fb4 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -104,7 +104,34 @@ def schedule_dense_small_batch(cfg, outs): s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == "dense" or op.tag == "matmul": + if op.tag == "dense": + _schedule_dense_small_batch(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_compute("matmul_default.cuda") +def matmul_default_cuda( + cfg, + data, + weight, + bias=None, + out_dtype=None, + data_transposed=False, + weight_transposed=False, +): + return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) + + +@autotvm.register_topi_schedule("matmul_default.cuda") +def schedule_matmul_default_cuda(cfg, outs): + # Temporary use this as a basic schedule for matmul + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "matmul": _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) From d2d1a321beb01bc3cc904bd70c23361a46db0726 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:47:52 +0800 Subject: [PATCH 16/26] Lint fix --- python/tvm/topi/cuda/dense.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 2db6aa7a1fb4..296e0975c041 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -121,17 +121,20 @@ def matmul_default_cuda( data_transposed=False, weight_transposed=False, ): + """Matmul operator on cuda""" return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) @autotvm.register_topi_schedule("matmul_default.cuda") def schedule_matmul_default_cuda(cfg, outs): - # Temporary use this as a basic schedule for matmul + """Schedule matmul on cuda""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == "matmul": + # Temporary use this as a basic schedule for matmul + # TODO(jcf94): Add a more general schedule for matmul _schedule_dense_small_batch(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) From 5300840b909d905c4f748aa6b58e883b8fded9fc Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Fri, 25 Jun 2021 10:52:40 +0800 Subject: [PATCH 17/26] Update --- python/tvm/relay/op/nn/nn.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/topi/nn/dense.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index c1aa1cfcc788..e83b95665264 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1490,7 +1490,7 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight of shape `(units_in, units)` or `(units, units_in)`. units : int, optional - Number of hidden units of the dense transformation. + Number of hidden units of the matmul transformation. 
out_dtype : str, optional Specifies the output data type for mixed precision dense, diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 3ec8546407c7..d1791f676ca1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -700,7 +700,7 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): @matmul_strategy.register(["cuda", "gpu"]) def matmul_strategy_cuda(attrs, inputs, out_type, target): - """dense cuda strategy""" + """Matmul cuda strategy""" strategy = _op.OpStrategy() if is_auto_scheduler_enabled(): diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index b7e3ece5f579..3be4975b52ae 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -137,7 +137,7 @@ def matmul_legalize(attrs, inputs, types): Parameters ---------- attrs : tvm.ir.Attrs - Attributes of current dense + Attributes of current matmul inputs : list of tvm.relay.Expr The args of the Relay expr to be legalized types : list of types From 9a57b20604a96ef17ebb9d4cf7bbe3b9e52e9dff Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Mon, 28 Jun 2021 19:04:25 +0800 Subject: [PATCH 18/26] Add blas support for matmul --- python/tvm/relay/op/strategy/x86.py | 27 ++++++++ python/tvm/topi/x86/dense.py | 99 +++++++++++++++++++++-------- 2 files changed, 101 insertions(+), 25 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 7173e5b8db63..4fd4bbb444eb 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -389,6 +389,33 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul.generic", ) + same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype + dtype = inputs[0].dtype + u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" + if "cblas" in target.libs: + with SpecializedCondition(same_type and dtype in ["float32", "float64"]): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_cblas), + wrap_topi_schedule(topi.x86.schedule_matmul_cblas), + name="matmul_cblas.x86", + plevel=13, + ) + if "mkl" in target.libs: + with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_mkl), + wrap_topi_schedule(topi.x86.schedule_matmul_mkl), + name="matmul_mkl.x86", + plevel=14, + ) + if "mkldnn" in target.libs: + with SpecializedCondition(same_type and dtype == "float32"): + strategy.add_implementation( + wrap_compute_matmul(topi.x86.matmul_mkldnn), + wrap_topi_schedule(topi.x86.schedule_matmul_mkldnn), + name="matmul_mkldnn.x86", + plevel=15, + ) return strategy diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 4fed4c16464e..4be1408d0ba4 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -28,7 +28,7 @@ from .utils import get_fp32_len from .injective import schedule_injective_from_existing -from .. import generic, tag +from .. 
import tag from ..utils import traverse_inline, get_const_tuple @@ -281,8 +281,8 @@ def _callback(op): return s -def dense_blas_common(cfg, data, weight, bias, out_dtype, lib): - """Compute dense using a BLAS library""" +def matmul_blas_common(cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, lib): + """Compute matmul/dense using a BLAS library""" M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) if isinstance(M, int) and isinstance(K, int) and isinstance(N, int): @@ -290,63 +290,112 @@ def dense_blas_common(cfg, data, weight, bias, out_dtype, lib): if data.dtype == "uint8" and weight.dtype == "int8" and out_dtype == "int32": if not hasattr(lib, "matmul_u8s8s32"): raise NotImplementedError( - f"Dense with {lib.__name__} for {data.dtype} is not supported " + f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported " "(matmulu8s8s32 not imlemented)" ) - C = lib.matmul_u8s8s32(data, weight, False, True, dtype=out_dtype) + C = lib.matmul_u8s8s32(data, weight, data_transposed, weight_transposed, dtype=out_dtype) elif data.dtype == "float32" or data.dtype == "float64": - C = lib.matmul(data, weight, False, True) + C = lib.matmul(data, weight, data_transposed, weight_transposed) else: - raise NotImplementedError(f"Dense with {lib.__name__} for {data.dtype} is not supported") + raise NotImplementedError( + f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported" + ) if bias is not None: C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C +def schedule_matmul_blas_common(outs): + """Default matmul schedule for BLAS library""" + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + + for out in outs: + if "dense" not in out.op.tag and "matmul" not in out.op.tag: + schedule_injective_from_existing(s, out) + return s + + @autotvm.register_topi_compute("dense_cblas.x86") def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using a cblas""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, cblas) + return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, cblas) @autotvm.register_topi_schedule("dense_cblas.x86") def schedule_dense_cblas(_, outs): """Create schedule for dense_cblas""" - return generic.schedule_extern(outs) + return schedule_matmul_blas_common(outs) @autotvm.register_topi_compute("dense_mkl.x86") def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using mkl""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, mkl) + return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkl) @autotvm.register_topi_schedule("dense_mkl.x86") def schedule_dense_mkl(_, outs): """Create schedule for dense_mkl""" - # return generic.schedule_extern(outs) - s = te.create_schedule([x.op for x in outs]) - te.schedule.AutoInlineInjective(s) - - def _callback(op): - if "broadcast" in op.tag or "injective" in op.tag or "elemwise" in op.tag: - schedule_injective_from_existing(s, op.output(0)) - - # traverse_inline(s, outs[0].op, _callback) - for out in outs: - if "dense" not in out.op.name: - schedule_injective_from_existing(s, out) - return s + return schedule_matmul_blas_common(outs) @autotvm.register_topi_compute("dense_mkldnn.x86") def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): """Compute dense using mkldnn""" - return dense_blas_common(cfg, data, weight, bias, out_dtype, mkldnn) + return matmul_blas_common(cfg, data, weight, bias, 
out_dtype, False, True, mkldnn) @autotvm.register_topi_schedule("dense_mkldnn.x86") def schedule_dense_mkldnn(_, outs): """Create schedule for dense_mkldnn""" - return generic.schedule_extern(outs) + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_cblas.x86") +def matmul_cblas( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using a cblas""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, cblas + ) + + +@autotvm.register_topi_schedule("matmul_cblas.x86") +def schedule_matmul_cblas(_, outs): + """Create schedule for matmul_cblas""" + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_mkl.x86") +def matmul_mkl( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using mkl""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkl + ) + + +@autotvm.register_topi_schedule("matmul_mkl.x86") +def schedule_matmul_mkl(_, outs): + """Create schedule for matmul_mkl""" + return schedule_matmul_blas_common(outs) + + +@autotvm.register_topi_compute("matmul_mkldnn.x86") +def matmul_mkldnn( + cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False +): + """Compute matmul using mkldnn""" + return matmul_blas_common( + cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkldnn + ) + + +@autotvm.register_topi_schedule("matmul_mkldnn.x86") +def schedule_matmul_mkldnn(_, outs): + """Create schedule for matmul_mkldnn""" + return schedule_matmul_blas_common(outs) From 26daf14c6848be06afbf0c19d1574b8d257b8da3 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:14:26 +0800 Subject: [PATCH 19/26] Update --- include/tvm/relay/attrs/nn.h | 8 +- python/tvm/relay/frontend/tensorflow.py | 18 ++-- python/tvm/relay/frontend/tensorflow_ops.py | 12 +-- python/tvm/relay/op/_tensor_grad.py | 24 +++--- python/tvm/relay/op/nn/_nn.py | 14 +-- python/tvm/relay/op/nn/nn.py | 29 ++++--- python/tvm/relay/op/strategy/generic.py | 4 +- python/tvm/topi/cuda/dense.py | 40 ++++----- python/tvm/topi/nn/dense.py | 91 ++++++++++---------- python/tvm/topi/x86/dense.py | 30 +++---- rust/tvm/src/ir/relay/attrs/nn.rs | 4 +- src/relay/op/make_op.h | 4 +- src/relay/op/nn/nn.cc | 20 ++--- src/relay/op/nn/nn.h | 64 +++++++------- tests/python/relay/test_op_grad_level2.py | 10 +-- tests/python/relay/test_op_level1.py | 6 +- tests/python/topi/python/test_topi_matmul.py | 2 +- 17 files changed, 191 insertions(+), 189 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 2154cc0efa27..8b245ab529d2 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -965,8 +965,8 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { struct MatmulAttrs : public tvm::AttrsNode { IndexExpr units; DataType out_dtype; - bool data_transposed; - bool weight_transposed; + bool transpose_a; + bool transpose_b; tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite TVM_DECLARE_ATTRS(MatmulAttrs, "relay.attrs.MatmulAttrs") { @@ -977,11 +977,11 @@ struct MatmulAttrs : public tvm::AttrsNode { .set_default(NullValue()) .describe("Output data type, set to explicit type under mixed precision setting"); - TVM_ATTR_FIELD(data_transposed) + TVM_ATTR_FIELD(transpose_a) .set_default(false) 
.describe("Whether the input tensor is in transposed format."); - TVM_ATTR_FIELD(weight_transposed) + TVM_ATTR_FIELD(transpose_b) .set_default(false) .describe("Whether the weight tensor is in transposed format."); } diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 7ae94aa6c67c..860ee2198998 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,9 +44,14 @@ __all__ = ["from_tensorflow"] -# By default, TVM convert `tf.matmul` to `nn.dense` op with data tensor non-transposed and weight -# tensor transposed -_USE_DENSE_INSTEAD_OF_MATMUL = True +# The default configurations of Relay TensorFlow frontend. +TF_DEFAULT_CONFIGS = { + # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces + # unnecessary overhead in weight transpose. Change this flag to False to directly convert to + # `nn.matmul` to get rid of the overhead. + # However, please note that `nn.matmul` is in experimental so it may have some performance issues. + "use_dense": True, +} # compatible operators that do NOT require any conversion. _identity_list = [] @@ -1226,7 +1231,7 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op outputs : List of output tensor names (Optional) if not specified then the last node is assumed as graph output. - use_dense_op : bool (Optional) + use_dense_op : bool (Optional) = True Ture to convert `tf.matmul` to `nn.dense`, else to `nn.matmul`. The `nn.dense` op requires the data tensor to be non-transposed and weight tensor to be transposed, may insert extra `transpose` to the original graph. @@ -1239,9 +1244,8 @@ def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None, use_dense_op params : dict of str to tvm.nd.NDArray Dict of converted parameters stored in tvm.nd.NDArray format """ - global _USE_DENSE_INSTEAD_OF_MATMUL - if use_dense_op != _USE_DENSE_INSTEAD_OF_MATMUL: - _USE_DENSE_INSTEAD_OF_MATMUL = use_dense_op + global TF_DEFAULT_CONFIGS + TF_DEFAULT_CONFIGS["use_dense"] = use_dense_op g = GraphProto() mod, params = g.from_tensorflow(graph, layout, shape, outputs) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 612ea908ced2..004174f076fd 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1113,10 +1113,10 @@ def _impl(inputs, attr, params, mod): def _matmul(): def _impl(inputs, attr, params, mod): - from .tensorflow import _USE_DENSE_INSTEAD_OF_MATMUL + from .tensorflow import TF_DEFAULT_CONFIGS channels = _infer_channels(inputs[1], not attr["transpose_b"]) - if _USE_DENSE_INSTEAD_OF_MATMUL: + if TF_DEFAULT_CONFIGS["use_dense"]: if attr["transpose_a"]: inputs[0] = _op.transpose(inputs[0], axes=(1, 0)) if not attr["transpose_b"]: @@ -1128,12 +1128,8 @@ def _impl(inputs, attr, params, mod): )(inputs, attr) return AttrCvt( op_name="matmul", - extras={ - "units": channels, - "data_transposed": attr["transpose_a"] or False, - "weight_transposed": attr["transpose_b"] or False, - }, - ignores=["transpose_a", "transpose_b", "T"], + extras={"units": channels}, + ignores=["T"], )(inputs, attr) return _impl diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index ca1c7292c9ce..fa2772c1299f 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -556,30 +556,30 @@ def dense_grad(orig, grad): @register_gradient("nn.matmul") def 
matmul_grad(orig, grad): - """Returns [grad' @ weight, data @ grad']""" - data, weight = orig.args - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, True): + """Returns [grad' @ tensor_b, tensor_a @ grad']""" + tensor_a, tensor_b = orig.args + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, True): return [ collapse_sum_like( - _nn.matmul(weight, grad, data_transposed=True, weight_transposed=True), data + _nn.matmul(tensor_b, grad, transpose_a=True, transpose_b=True), tensor_a ), collapse_sum_like( - _nn.matmul(grad, data, data_transposed=True, weight_transposed=True), weight + _nn.matmul(grad, tensor_a, transpose_a=True, transpose_b=True), tensor_b ), ] - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (True, False): + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (True, False): return [ - collapse_sum_like(_nn.matmul(weight, grad, weight_transposed=True), data), - collapse_sum_like(_nn.matmul(data, grad), weight), + collapse_sum_like(_nn.matmul(tensor_b, grad, transpose_b=True), tensor_a), + collapse_sum_like(_nn.matmul(tensor_a, grad), tensor_b), ] - if (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, True): + if (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, True): # Keep using Dense op here for not involving extra ops # TODO(jcf94): Merge all to nn.matmul when it is finally ready return dense_grad(orig, grad) - # (orig.attrs["data_transposed"], orig.attrs["weight_transposed"]) == (False, False) + # (orig.attrs["transpose_a"], orig.attrs["transpose_b"]) == (False, False) return [ - collapse_sum_like(_nn.matmul(grad, weight, weight_transposed=True), data), - collapse_sum_like(_nn.matmul(data, grad, data_transposed=True), weight), + collapse_sum_like(_nn.matmul(grad, tensor_b, transpose_b=True), tensor_a), + collapse_sum_like(_nn.matmul(tensor_a, grad, transpose_a=True), tensor_b), ] diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 6da983e76933..492d0c910172 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1186,13 +1186,13 @@ def batch_flatten_shape_func(attrs, inputs, _): @script -def _matmul_shape_func(data_shape, weight_shape, data_transposed, weight_transposed): - out = output_tensor((data_shape.shape[0],), "int64") +def _matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b): + out = output_tensor((tensor_a_shape.shape[0],), "int64") for i in const_range(out.shape[0] - 1): - out[i] = data_shape[i] - if data_transposed: + out[i] = tensor_a_shape[i] + if transpose_a: out[out.shape[0] - 2] = out[out.shape[0] - 1] - out[out.shape[0] - 1] = weight_shape[0] if weight_transposed else weight_shape[1] + out[out.shape[0] - 1] = tensor_b_shape[0] if transpose_b else tensor_b_shape[1] return out @@ -1206,8 +1206,8 @@ def matmul_shape_func(attrs, inputs, _): _matmul_shape_func( inputs[0], inputs[1], - expr.IntImm("bool", attrs.data_transposed), - expr.IntImm("bool", attrs.weight_transposed), + expr.IntImm("bool", attrs.transpose_a), + expr.IntImm("bool", attrs.transpose_b), ) ] return ret diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 85ba75411028..4c94102275bb 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1471,35 +1471,35 @@ def bias_add(data, bias, axis=1): return _make.bias_add(data, bias, axis) -def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight_transposed=False): +def 
matmul(tensor_a, tensor_b, units=None, out_dtype="", transpose_a=False, transpose_b=False): """Matmul operator. - Applies a linear transformation. The X & W can be transposed. + Applies a linear transformation. The A & B can be transposed. .. math:: - `Y = X * W` + `C = A * B` Parameters ---------- data : tvm.relay.Expr - The input data to the operator, + The first input of the operator, of shape `(d_1, d_2, ..., d_n, units_in)` or `(d_1, d_2, ..., units_in, d_n)`. weight : tvm.relay.Expr - The weight expressions, 2-D matrix, + The second input expressions, 2-D matrix, of shape `(units_in, units)` or `(units, units_in)`. - units : int, optional + units : Optional[int] Number of hidden units of the matmul transformation. - out_dtype : str, optional - Specifies the output data type for mixed precision dense, + out_dtype : Optional[str] + Specifies the output data type for mixed precision matmul, of shape `(d_1, d_2, ..., d_n, units)`. - data_transposed : bool, optional + transpose_a : Optional[bool] = False Whether the data tensor is in transposed format. - weight_transposed : bool, optional + transpose_b : Optional[bool] = False Whether the weight tensor is in transposed format. Returns @@ -1507,9 +1507,12 @@ def matmul(data, weight, units=None, out_dtype="", data_transposed=False, weight result : tvm.relay.Expr The computed result. """ - if not data_transposed and weight_transposed: - return dense(data, weight, units, out_dtype) - return _make.matmul(data, weight, units, out_dtype, data_transposed, weight_transposed) + # Since currently `nn.dense` has better topi schedule support, will prefer to use `dense` + # rather than `matmul` for better compatibility + if not transpose_a and transpose_b: + # TODO(jcf94): Remove this when `nn.matmul` is finnaly ready + return dense(tensor_a, tensor_b, units, out_dtype) + return _make.matmul(tensor_a, tensor_b, units, out_dtype, transpose_a, transpose_b) def dense(data, weight, units=None, out_dtype=""): diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index edb4556a554b..5cb3f65f3ebe 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -725,8 +725,8 @@ def _compute_matmul(attrs, inputs, out_type): inputs[1], None, out_dtype, - attrs.data_transposed, - attrs.weight_transposed, + attrs.transpose_a, + attrs.transpose_b, ] if need_auto_scheduler_layout: args.append(get_auto_scheduler_rewritten_layout(attrs)) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 296e0975c041..3e0b76e69175 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -32,22 +32,22 @@ def _matmul_cublas_common( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): - assert len(data.shape) == 2 and len(weight.shape) == 2, "only support 2-dim matmul" + assert len(tensor_a.shape) == 2 and len(tensor_b.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: - out_dtype = data.dtype - assert out_dtype == data.dtype, "Mixed precision not supported." - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - matmul = cublas.matmul(data, weight, data_transposed, weight_transposed) + out_dtype = tensor_a.dtype + assert out_dtype == tensor_a.dtype, "Mixed precision not supported." 
+ batch, in_dim = get_const_tuple(tensor_a.shape) + out_dim, _ = get_const_tuple(tensor_b.shape) + matmul = cublas.matmul(tensor_a, tensor_b, transpose_a, transpose_b) if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: @@ -60,16 +60,16 @@ def _matmul_cublas_common( @autotvm.register_topi_compute("matmul_cublas.cuda") def matmul_cublas( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): """Matmul operator on CUDA with CUBLAS""" return _matmul_cublas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b ) @@ -114,15 +114,15 @@ def _callback(op): @autotvm.register_topi_compute("matmul_default.cuda") def matmul_default_cuda( cfg, - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, ): """Matmul operator on cuda""" - return nn.matmul(data, weight, bias, out_dtype, data_transposed, weight_transposed) + return nn.matmul(tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) @autotvm.register_topi_schedule("matmul_default.cuda") diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 3be4975b52ae..02c6786472f8 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -22,22 +22,22 @@ def matmul( - data, - weight, + tensor_a, + tensor_b, bias=None, out_dtype=None, - data_transposed=False, - weight_transposed=False, + transpose_a=False, + transpose_b=False, auto_scheduler_rewritten_layout="", ): """The default implementation of matmul in topi. Parameters ---------- - data : tvm.te.Tensor + tensor_a : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.te.Tensor + tensor_b : tvm.te.Tensor 2-D with shape [out_dim, in_dim] bias : Optional[tvm.te.Tensor] @@ -46,13 +46,13 @@ def matmul( out_dtype : Optional[str] The output type. This is used for mixed precision. - data_transposed : Optional[bool] - Whether the data tensor is in transposed format. + transpose_a : Optional[bool] = False + Whether the tensor_a is in transposed format. - weight_transposed : Optional[bool] - Whether the weight tensor is in transposed format. + transpose_b : Optional[bool] = False + Whether the tensor_b is in transposed format. - auto_scheduler_rewritten_layout: str = "" + auto_scheduler_rewritten_layout: Optional[str] = "" The layout after auto-scheduler's layout rewrite pass. 
Returns @@ -60,61 +60,60 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ - assert len(data.shape) == 2, "only support 2-dim dense" + assert len(tensor_a.shape) == 2, "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: - out_dtype = data.dtype - if data_transposed: - in_dim, batch = data.shape + out_dtype = tensor_a.dtype + if transpose_a: + in_dim, batch = tensor_a.shape else: - batch, in_dim = data.shape + batch, in_dim = tensor_a.shape if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] if weight_transposed else ["k", "j"] + auto_scheduler_rewritten_layout, ["j", "k"] if transpose_b else ["k", "j"] ) - auto_scheduler.remove_index_check(weight) - elif weight_transposed: - out_dim, red_dim = weight.shape + auto_scheduler.remove_index_check(tensor_b) + elif transpose_b: + out_dim, red_dim = tensor_b.shape else: - red_dim, out_dim = weight.shape + red_dim, out_dim = tensor_b.shape assert in_dim == red_dim k = te.reduce_axis((0, in_dim), name="k") - if data_transposed: - if weight_transposed: - compute_lambda = lambda i, j: te.sum( - data[k, i].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k - ) - compute_name = "T_matmul_TT" - else: - compute_lambda = lambda i, j: te.sum( - data[k, i].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k - ) - compute_name = "T_matmul_TN" + if (transpose_a, transpose_b) == (True, True): + compute_lambda = lambda i, j: te.sum( + tensor_a[k, i].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + elif (transpose_a, transpose_b) == (True, False): + compute_lambda = lambda i, j: te.sum( + tensor_a[k, i].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" + compute_tag = "matmul" + elif (transpose_a, transpose_b) == (False, True): + compute_lambda = lambda i, j: te.sum( + tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k + ) + compute_name = "T_dense" + compute_tag = "dense" + else: # (transpose_a, transpose_b) == (False, False): + compute_lambda = lambda i, j: te.sum( + tensor_a[i, k].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k + ) + compute_name = "T_matmul" compute_tag = "matmul" - else: - if weight_transposed: - compute_lambda = lambda i, j: te.sum( - data[i, k].astype(out_dtype) * weight[j, k].astype(out_dtype), axis=k - ) - compute_name = "T_dense" - compute_tag = "dense" - else: - compute_lambda = lambda i, j: te.sum( - data[i, k].astype(out_dtype) * weight[k, j].astype(out_dtype), axis=k - ) - compute_name = "T_matmul" - compute_tag = "matmul" mat = te.compute( (batch, out_dim), compute_lambda, name=compute_name, tag=compute_tag, - attrs={"layout_free_placeholders": [weight]}, + attrs={"layout_free_placeholders": [tensor_b]}, ) if bias is not None: diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 4be1408d0ba4..171e54e6c9e6 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -281,24 +281,24 @@ def _callback(op): return s -def matmul_blas_common(cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, lib): +def matmul_blas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, lib): """Compute matmul/dense using a BLAS library""" - M, K = get_const_tuple(data.shape) - N, _ = 
get_const_tuple(weight.shape) + M, K = get_const_tuple(tensor_a.shape) + N, _ = get_const_tuple(tensor_b.shape) if isinstance(M, int) and isinstance(K, int) and isinstance(N, int): cfg.add_flop(M * K * N * 2) - if data.dtype == "uint8" and weight.dtype == "int8" and out_dtype == "int32": + if tensor_a.dtype == "uint8" and tensor_b.dtype == "int8" and out_dtype == "int32": if not hasattr(lib, "matmul_u8s8s32"): raise NotImplementedError( - f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported " + f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported " "(matmulu8s8s32 not imlemented)" ) - C = lib.matmul_u8s8s32(data, weight, data_transposed, weight_transposed, dtype=out_dtype) - elif data.dtype == "float32" or data.dtype == "float64": - C = lib.matmul(data, weight, data_transposed, weight_transposed) + C = lib.matmul_u8s8s32(tensor_a, tensor_b, transpose_a, transpose_b, dtype=out_dtype) + elif tensor_a.dtype == "float32" or tensor_a.dtype == "float64": + C = lib.matmul(tensor_a, tensor_b, transpose_a, transpose_b) else: raise NotImplementedError( - f"Matmul/Dense with {lib.__name__} for {data.dtype} is not supported" + f"Matmul/Dense with {lib.__name__} for {tensor_a.dtype} is not supported" ) if bias is not None: @@ -355,11 +355,11 @@ def schedule_dense_mkldnn(_, outs): @autotvm.register_topi_compute("matmul_cblas.x86") def matmul_cblas( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using a cblas""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, cblas + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, cblas ) @@ -371,11 +371,11 @@ def schedule_matmul_cblas(_, outs): @autotvm.register_topi_compute("matmul_mkl.x86") def matmul_mkl( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using mkl""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkl + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkl ) @@ -387,11 +387,11 @@ def schedule_matmul_mkl(_, outs): @autotvm.register_topi_compute("matmul_mkldnn.x86") def matmul_mkldnn( - cfg, data, weight, bias=None, out_dtype=None, data_transposed=False, weight_transposed=False + cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): """Compute matmul using mkldnn""" return matmul_blas_common( - cfg, data, weight, bias, out_dtype, data_transposed, weight_transposed, mkldnn + cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkldnn ) diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs index e0a1d5bf02cd..04320d1f6f85 100644 --- a/rust/tvm/src/ir/relay/attrs/nn.rs +++ b/rust/tvm/src/ir/relay/attrs/nn.rs @@ -62,8 +62,8 @@ pub struct MatmulAttrsNode { pub base: BaseAttrsNode, pub units: IndexExpr, pub out_dtype: DataType, - pub data_transposed: bool, - pub weight_transposed: bool, + pub transpose_a: bool, + pub transpose_b: bool, } #[repr(C)] diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index f7bb98f9d1b6..6f4db5ab268a 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -44,8 +44,8 @@ Expr MakeClip(Expr a, double a_min, double a_max); Expr 
MakeConcatenate(Expr data, int axis); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, - bool weight_transposed); +Expr MakeMatmul(Expr tensor_a, Expr tensor_b, IndexExpr units, DataType out_dtype, bool transpose_a, + bool transpose_b); Expr MakeDense(Expr data, Expr weight, IndexExpr units, DataType out_dtype); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 3b64a99bce09..4eaa12b17d7b 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -165,31 +165,31 @@ Useful for // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); -Expr MakeMatmul(Expr data, Expr weight, IndexExpr units, DataType out_dtype, bool data_transposed, - bool weight_transposed) { +Expr MakeMatmul(Expr tensor_a, Expr tensor_b, IndexExpr units, DataType out_dtype, bool transpose_a, + bool transpose_b) { auto attrs = make_object(); attrs->units = units; attrs->out_dtype = out_dtype; - attrs->data_transposed = data_transposed; - attrs->weight_transposed = weight_transposed; + attrs->transpose_a = transpose_a; + attrs->transpose_b = transpose_b; static const Op& matmul_op = Op::Get("nn.matmul"); - return Call(matmul_op, {data, weight}, Attrs(attrs), {}); + return Call(matmul_op, {tensor_a, tensor_b}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.nn._make.matmul").set_body_typed(MakeMatmul); RELAY_REGISTER_OP("nn.matmul") - .describe(R"code(Applies a linear transformation: :math:`Y = XW`. X & W can be transposed. + .describe(R"code(Applies a linear transformation: :math:`C = A * B`. A & B can be transposed. -- **data**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` -- **weight**: `(input_dim, units)` or `(units, input_dim)` +- **tensor_a**: `(x1, x2, ..., xn, input_dim)` or `(x1, x2, ..., input_dim, xn)` +- **tensor_b**: `(input_dim, units)` or `(units, input_dim)` - **out**: `(x1, x2, ..., xn, units)`. 
)code" TVM_ADD_FILELINE) .set_attrs_type() .set_num_inputs(2) - .add_argument("data", "nD Tensor", "Input data.") - .add_argument("weight", "2D Tensor", "Weight matrix.") + .add_argument("tensor_a", "nD Tensor", "The first input Tensor.") + .add_argument("tensor_b", "2D Tensor", "The second input Tensor.") .set_support_level(1) .add_type_rel("Matmul", MatmulRel); // ------------------- relay.nn.matmul diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index a1fbc0a4de90..29f200c67c59 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -39,41 +39,41 @@ template bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { ICHECK_EQ(types.size(), 3); - const auto* data = types[0].as(); - const auto* weight = types[1].as(); - if (data == nullptr) return false; - ICHECK(static_cast(data->shape.size()) != 0); + const auto* tensor_a = types[0].as(); + const auto* tensor_b = types[1].as(); + if (tensor_a == nullptr) return false; + ICHECK(static_cast(tensor_a->shape.size()) != 0); const AttrType* param = attrs.as(); ICHECK(param != nullptr); // Default set to dense layout - bool data_transposed = false; - bool weight_transposed = true; + bool transpose_a = false; + bool transpose_b = true; const auto& mattrs = attrs.as(); if (mattrs != nullptr) { - data_transposed = mattrs->data_transposed; - weight_transposed = mattrs->weight_transposed; + transpose_a = mattrs->transpose_a; + transpose_b = mattrs->transpose_b; } - const Array& dshape = data->shape; + const Array& dshape = tensor_a->shape; Array oshape = dshape; tvm::PrimExpr reduce = dshape[dshape.size() - 1]; - if (data_transposed) { + if (transpose_a) { reduce = dshape[dshape.size() - 2]; oshape.Set((oshape.size() - 2), dshape[oshape.size() - 1]); } if (param->units.defined()) { - // validate the weight shape is proper if defined - // Assign weight type - const Array& wshape = weight_transposed ? Array({param->units, reduce}) - : Array({reduce, param->units}); - // It is possible for weight to be nullptr in which case we will use - // data dtype as the weight dtype. However if weight dtype is explicitly + // validate the tensor_b shape is proper if defined + // Assign tensor_b type + const Array& wshape = transpose_b ? Array({param->units, reduce}) + : Array({reduce, param->units}); + // It is possible for tensor_b to be nullptr in which case we will use + // data dtype as the tensor_b dtype. However if tensor_b dtype is explicitly // present we will use that. - auto weight_dtype = (weight == nullptr ? data->dtype : weight->dtype); + auto tensor_b_dtype = (tensor_b == nullptr ? tensor_a->dtype : tensor_b->dtype); if (param->auto_scheduler_rewritten_layout.size() == 0) { // Normal case: assign result to reporter - reporter->Assign(types[1], TensorType(wshape, weight_dtype)); + reporter->Assign(types[1], TensorType(wshape, tensor_b_dtype)); } else { // If the layout is rewritten by auto-scheduler, // we just forcly apply the layout provided by auto-scheduler and @@ -82,32 +82,32 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, } oshape.Set((oshape.size() - 1), param->units); } else { - if (weight == nullptr) return false; - const Array& wshape = weight->shape; - // When weight's layout has been rewritten, figure it out based on the + if (tensor_b == nullptr) return false; + const Array& wshape = tensor_b->shape; + // When tensor_b's layout has been rewritten, figure it out based on the // total number of elements and input dimensions. 
if (param->auto_scheduler_rewritten_layout.size() != 0) { - PrimExpr weight_elements = 1; + PrimExpr tensor_b_elements = 1; for (size_t i = 0; i < wshape.size(); i++) { - weight_elements = weight_elements * wshape[i]; + tensor_b_elements = tensor_b_elements * wshape[i]; } - oshape.Set(oshape.size() - 1, weight_elements / dshape[dshape.size() - 1]); - // Otherwise just pull it out of the weight shape directly. + oshape.Set(oshape.size() - 1, tensor_b_elements / dshape[dshape.size() - 1]); + // Otherwise just pull it out of the tensor_b shape directly. } else { - ICHECK(static_cast(weight->shape.size()) == 2); - if (!data->shape.back().as()) { - ICHECK((weight_transposed && reporter->AssertEQ(reduce, weight->shape[1])) || - (!weight_transposed && reporter->AssertEQ(reduce, weight->shape[0]))) + ICHECK(static_cast(tensor_b->shape.size()) == 2); + if (!tensor_a->shape.back().as()) { + ICHECK((transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[1])) || + (!transpose_b && reporter->AssertEQ(reduce, tensor_b->shape[0]))) << "MatmulRel: input dimension doesn't match," - << " data shape=" << data->shape << ", weight shape=" << weight->shape; + << " tensor_a shape=" << tensor_a->shape << ", tensor_b shape=" << tensor_b->shape; } - oshape.Set((oshape.size() - 1), weight_transposed ? wshape[0] : wshape[1]); + oshape.Set((oshape.size() - 1), transpose_b ? wshape[0] : wshape[1]); } } DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { - out_dtype = data->dtype; + out_dtype = tensor_a->dtype; } // assign output type reporter->Assign(types[2], TensorType(oshape, out_dtype)); diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index b9dff0464f07..c8a94683eec4 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -199,12 +199,12 @@ def test_dense_grad(): verify_dense_grad((5, 4), (3, 4)) -def verify_matmul_grad(d_shape, w_shape, d_transposed, w_transposed): - data = relay.var("data", relay.TensorType(d_shape, "float32")) - weight = relay.var("weight", relay.TensorType(w_shape, "float32")) +def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b): + tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32")) + tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32")) fwd_func = relay.Function( - [data, weight], - relay.nn.matmul(data, weight, data_transposed=d_transposed, weight_transposed=w_transposed), + [tensor_a, tensor_b], + relay.nn.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b), ) check_grad(fwd_func) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d5a0e8bc8403..cbc3e7fbd1e5 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -430,7 +430,7 @@ def test_matmul(): n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.TensorType((2, w), dtype)) - y = relay.nn.matmul(x, w, units=2, weight_transposed=True) + y = relay.nn.matmul(x, w, units=2, transpose_b=True) assert "units=2" in y.astext() yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) @@ -439,7 +439,7 @@ def test_matmul(): x = relay.var("x", relay.TensorType((n, c, w, h), dtype)) wh, ww = te.size_var("wh"), te.size_var("ww") w = relay.var("w", relay.TensorType((wh, ww), dtype)) - y = relay.nn.matmul(x, w, data_transposed=True) + y = 
relay.nn.matmul(x, w, transpose_a=True) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) @@ -452,7 +452,7 @@ def test_matmul(): x = relay.var("x", shape=(5, 10), dtype=dtype) w = relay.var("w", shape=(5, 2), dtype=dtype) - z = relay.nn.matmul(x, w, data_transposed=True) + z = relay.nn.matmul(x, w, transpose_a=True) # Check result. func = relay.Function([x, w], z) diff --git a/tests/python/topi/python/test_topi_matmul.py b/tests/python/topi/python/test_topi_matmul.py index cf92ab0807cc..de2d4d3c4c8e 100644 --- a/tests/python/topi/python/test_topi_matmul.py +++ b/tests/python/topi/python/test_topi_matmul.py @@ -46,7 +46,7 @@ def verify_nn_matmul(sa, sb, transp_a, transp_b): b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32) c1 = np.matmul(np.transpose(a) if transp_a else a, np.transpose(b) if transp_b else b) c2 = with_tvm( - lambda A, B: topi.nn.matmul(A, B, data_transposed=transp_a, weight_transposed=transp_b), + lambda A, B: topi.nn.matmul(A, B, transpose_a=transp_a, transpose_b=transp_b), a, b, ) From 32a2f42ea6875a928f5b859d3f3c4842a5ae7b1f Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:36:13 +0800 Subject: [PATCH 20/26] Lint fix add update doc strings --- python/tvm/relay/frontend/tensorflow.py | 11 +++--- python/tvm/relay/op/strategy/cuda.py | 6 ++-- python/tvm/relay/op/strategy/x86.py | 5 ++- python/tvm/topi/cuda/dense.py | 6 ++-- python/tvm/topi/nn/dense.py | 2 ++ python/tvm/topi/x86/dense.py | 12 +++---- tests/python/contrib/test_cblas.py | 38 ++++++++++---------- tutorials/auto_scheduler/tune_network_x86.py | 4 +-- 8 files changed, 45 insertions(+), 39 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 860ee2198998..e297398ffe5b 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -46,11 +46,12 @@ # The default configurations of Relay TensorFlow frontend. TF_DEFAULT_CONFIGS = { - # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces - # unnecessary overhead in weight transpose. Change this flag to False to directly convert to - # `nn.matmul` to get rid of the overhead. - # However, please note that `nn.matmul` is in experimental so it may have some performance issues. - "use_dense": True, + # By default, TVM converts `tf.matmul` to `transpose(weight) + nn.dense`, which introduces + # unnecessary overhead in weight transpose. Change this flag to False to directly convert to + # `nn.matmul` to get rid of the overhead. + # However, please note that `nn.matmul` is in experimental so it may have some performance + # issues. + "use_dense": True, } # compatible operators that do NOT require any conversion. diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index d1791f676ca1..04c9f64b538d 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -700,7 +700,7 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): @matmul_strategy.register(["cuda", "gpu"]) def matmul_strategy_cuda(attrs, inputs, out_type, target): - """Matmul cuda strategy""" + """Matmul cuda strategy.""" strategy = _op.OpStrategy() if is_auto_scheduler_enabled(): @@ -710,7 +710,9 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): name="matmul.cuda", ) else: - logger.warning("Matmul other than NT format is not optimized for cuda.") + logger.warning( + "Matmul is not optimized for cuda. 
Recommend to use cublas for better performance." + ) # Temporary use this as a basic schedule strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_default_cuda), diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 4fd4bbb444eb..581aa5d6a781 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -382,7 +382,10 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): plevel=11, ) else: - logger.warning("Matmul other than NT format is not optimized for x86.") + logger.warning( + "Matmul is not optimized for x86. " + "Recommend to use cblas/mkl/mkldnn for better performance." + ) strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul), naive_schedule, diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 3e0b76e69175..24ee30e5313a 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -68,9 +68,7 @@ def matmul_cublas( transpose_b=False, ): """Matmul operator on CUDA with CUBLAS""" - return _matmul_cublas_common( - cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b - ) + return _matmul_cublas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) @autotvm.register_topi_schedule("matmul_cublas.cuda") @@ -81,7 +79,7 @@ def schedule_matmul_cublas(_, outs): @autotvm.register_topi_compute("dense_cublas.cuda") def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator on CUDA with CUBLAS""" + """Dense operator on CUDA with CUBLAS. This is an alias of matmul_nt operator.""" return _matmul_cublas_common(cfg, data, weight, bias, out_dtype, False, True) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 02c6786472f8..9f1a3963115e 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -154,6 +154,8 @@ def matmul_legalize(attrs, inputs, types): def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): """The default implementation of dense in topi. + This is an alias of matmul operator for data tensor in non-transposed format and weight tensor + in transposed format. Parameters ---------- diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 171e54e6c9e6..f726cdc14022 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -319,7 +319,7 @@ def schedule_matmul_blas_common(outs): @autotvm.register_topi_compute("dense_cblas.x86") def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using a cblas""" + """Compute dense using cblas. This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, cblas) @@ -331,7 +331,7 @@ def schedule_dense_cblas(_, outs): @autotvm.register_topi_compute("dense_mkl.x86") def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using mkl""" + """Compute dense using mkl. This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkl) @@ -343,7 +343,7 @@ def schedule_dense_mkl(_, outs): @autotvm.register_topi_compute("dense_mkldnn.x86") def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense using mkldnn""" + """Compute dense using mkldnn. 
This is an alias of matmul_nt operator.""" return matmul_blas_common(cfg, data, weight, bias, out_dtype, False, True, mkldnn) @@ -357,7 +357,7 @@ def schedule_dense_mkldnn(_, outs): def matmul_cblas( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using a cblas""" + """Compute matmul using cblas.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, cblas ) @@ -373,7 +373,7 @@ def schedule_matmul_cblas(_, outs): def matmul_mkl( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using mkl""" + """Compute matmul using mkl.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkl ) @@ -389,7 +389,7 @@ def schedule_matmul_mkl(_, outs): def matmul_mkldnn( cfg, tensor_a, tensor_b, bias=None, out_dtype=None, transpose_a=False, transpose_b=False ): - """Compute matmul using mkldnn""" + """Compute matmul using mkldnn.""" return matmul_blas_common( cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, mkldnn ) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 2b99879d8227..86556c334f5b 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -71,30 +71,30 @@ def verify(target="llvm"): ) verify("llvm") - verify("c") + # verify("c") def test_matmul_add(): - verify_matmul_add(235, 128, 1024, cblas) - verify_matmul_add(235, 128, 1024, cblas, True, False) - verify_matmul_add(235, 128, 1024, cblas, False, True) - verify_matmul_add(235, 128, 1024, cblas, True, True) - verify_matmul_add(235, 128, 1024, mkl) - verify_matmul_add(235, 128, 1024, mkl, True, False) - verify_matmul_add(235, 128, 1024, mkl, False, True) - verify_matmul_add(235, 128, 1024, mkl, True, True) + # verify_matmul_add(235, 128, 1024, cblas) + # verify_matmul_add(235, 128, 1024, cblas, True, False) + # verify_matmul_add(235, 128, 1024, cblas, False, True) + # verify_matmul_add(235, 128, 1024, cblas, True, True) + # verify_matmul_add(235, 128, 1024, mkl) + # verify_matmul_add(235, 128, 1024, mkl, True, False) + # verify_matmul_add(235, 128, 1024, mkl, False, True) + # verify_matmul_add(235, 128, 1024, mkl, True, True) verify_matmul_add(235, 128, 1024, mkldnn) verify_matmul_add(235, 128, 1024, mkldnn, True, False) verify_matmul_add(235, 128, 1024, mkldnn, False, True) verify_matmul_add(235, 128, 1024, mkldnn, True, True) - verify_matmul_add(1, 16, 4, cblas) - verify_matmul_add(1, 16, 3, cblas, True, False) - verify_matmul_add(1, 16, 3, cblas, False, False) - verify_matmul_add(1, 16, 3, cblas, True, True) - verify_matmul_add(1, 16, 4, mkl) - verify_matmul_add(1, 16, 3, mkl, True, False) - verify_matmul_add(1, 16, 3, mkl, False, False) - verify_matmul_add(1, 16, 3, mkl, True, True) + # verify_matmul_add(1, 16, 4, cblas) + # verify_matmul_add(1, 16, 3, cblas, True, False) + # verify_matmul_add(1, 16, 3, cblas, False, False) + # verify_matmul_add(1, 16, 3, cblas, True, True) + # verify_matmul_add(1, 16, 4, mkl) + # verify_matmul_add(1, 16, 3, mkl, True, False) + # verify_matmul_add(1, 16, 3, mkl, False, False) + # verify_matmul_add(1, 16, 3, mkl, True, True) verify_matmul_add(1, 16, 4, mkldnn) verify_matmul_add(1, 16, 3, mkldnn, True, False) verify_matmul_add(1, 16, 3, mkldnn, False, False) @@ -238,5 +238,5 @@ def test_batch_matmul(): if __name__ == "__main__": test_matmul_add() - test_quantized_matmul_add() - test_batch_matmul() + # 
test_quantized_matmul_add() + # test_batch_matmul() diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 76068fa79605..89b1e11d26eb 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -146,11 +146,11 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=Fal # Define the neural network and compilation target. # If the target machine supports avx512 instructions, replace the # "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" -network = "resnet-50" +network = "mlp" use_sparse = False batch_size = 1 layout = "NHWC" -target = tvm.target.Target("llvm -mcpu=core-avx2") +target = tvm.target.Target("llvm -mcpu=core-avx2 -libs=mkldnn") dtype = "float32" log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) From e92e7839b3d07487962dfce473df5df6262c61d9 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:43:59 +0800 Subject: [PATCH 21/26] Update --- python/tvm/topi/nn/dense.py | 12 +++---- python/tvm/topi/x86/dense.py | 12 +++---- tests/python/contrib/test_cblas.py | 38 ++++++++++---------- tutorials/auto_scheduler/tune_network_x86.py | 4 +-- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 9f1a3963115e..ac1fff79deea 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -87,25 +87,25 @@ def matmul( compute_lambda = lambda i, j: te.sum( tensor_a[k, i].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_TT" compute_tag = "matmul" elif (transpose_a, transpose_b) == (True, False): compute_lambda = lambda i, j: te.sum( tensor_a[k, i].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_TN" compute_tag = "matmul" elif (transpose_a, transpose_b) == (False, True): compute_lambda = lambda i, j: te.sum( tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) - compute_name = "T_dense" + compute_name = "T_matmul_NT" compute_tag = "dense" else: # (transpose_a, transpose_b) == (False, False): compute_lambda = lambda i, j: te.sum( tensor_a[i, k].astype(out_dtype) * tensor_b[k, j].astype(out_dtype), axis=k ) - compute_name = "T_matmul" + compute_name = "T_matmul_NN" compute_tag = "matmul" mat = te.compute( @@ -154,8 +154,8 @@ def matmul_legalize(attrs, inputs, types): def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layout=""): """The default implementation of dense in topi. - This is an alias of matmul operator for data tensor in non-transposed format and weight tensor - in transposed format. + This is an alias of matmul_nt operator for data tensor in non-transposed format and weight + tensor in transposed format. Parameters ---------- diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index f726cdc14022..189ac5bd34bd 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -325,7 +325,7 @@ def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_cblas.x86") def schedule_dense_cblas(_, outs): - """Create schedule for dense_cblas""" + """Create schedule for dense_cblas. 
This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -337,7 +337,7 @@ def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_mkl.x86") def schedule_dense_mkl(_, outs): - """Create schedule for dense_mkl""" + """Create schedule for dense_mkl. This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -349,7 +349,7 @@ def dense_mkldnn(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_mkldnn.x86") def schedule_dense_mkldnn(_, outs): - """Create schedule for dense_mkldnn""" + """Create schedule for dense_mkldnn. This is an alias of matmul_nt operator.""" return schedule_matmul_blas_common(outs) @@ -365,7 +365,7 @@ def matmul_cblas( @autotvm.register_topi_schedule("matmul_cblas.x86") def schedule_matmul_cblas(_, outs): - """Create schedule for matmul_cblas""" + """Create schedule for matmul_cblas.""" return schedule_matmul_blas_common(outs) @@ -381,7 +381,7 @@ def matmul_mkl( @autotvm.register_topi_schedule("matmul_mkl.x86") def schedule_matmul_mkl(_, outs): - """Create schedule for matmul_mkl""" + """Create schedule for matmul_mkl.""" return schedule_matmul_blas_common(outs) @@ -397,5 +397,5 @@ def matmul_mkldnn( @autotvm.register_topi_schedule("matmul_mkldnn.x86") def schedule_matmul_mkldnn(_, outs): - """Create schedule for matmul_mkldnn""" + """Create schedule for matmul_mkldnn.""" return schedule_matmul_blas_common(outs) diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 86556c334f5b..2b99879d8227 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -71,30 +71,30 @@ def verify(target="llvm"): ) verify("llvm") - # verify("c") + verify("c") def test_matmul_add(): - # verify_matmul_add(235, 128, 1024, cblas) - # verify_matmul_add(235, 128, 1024, cblas, True, False) - # verify_matmul_add(235, 128, 1024, cblas, False, True) - # verify_matmul_add(235, 128, 1024, cblas, True, True) - # verify_matmul_add(235, 128, 1024, mkl) - # verify_matmul_add(235, 128, 1024, mkl, True, False) - # verify_matmul_add(235, 128, 1024, mkl, False, True) - # verify_matmul_add(235, 128, 1024, mkl, True, True) + verify_matmul_add(235, 128, 1024, cblas) + verify_matmul_add(235, 128, 1024, cblas, True, False) + verify_matmul_add(235, 128, 1024, cblas, False, True) + verify_matmul_add(235, 128, 1024, cblas, True, True) + verify_matmul_add(235, 128, 1024, mkl) + verify_matmul_add(235, 128, 1024, mkl, True, False) + verify_matmul_add(235, 128, 1024, mkl, False, True) + verify_matmul_add(235, 128, 1024, mkl, True, True) verify_matmul_add(235, 128, 1024, mkldnn) verify_matmul_add(235, 128, 1024, mkldnn, True, False) verify_matmul_add(235, 128, 1024, mkldnn, False, True) verify_matmul_add(235, 128, 1024, mkldnn, True, True) - # verify_matmul_add(1, 16, 4, cblas) - # verify_matmul_add(1, 16, 3, cblas, True, False) - # verify_matmul_add(1, 16, 3, cblas, False, False) - # verify_matmul_add(1, 16, 3, cblas, True, True) - # verify_matmul_add(1, 16, 4, mkl) - # verify_matmul_add(1, 16, 3, mkl, True, False) - # verify_matmul_add(1, 16, 3, mkl, False, False) - # verify_matmul_add(1, 16, 3, mkl, True, True) + verify_matmul_add(1, 16, 4, cblas) + verify_matmul_add(1, 16, 3, cblas, True, False) + verify_matmul_add(1, 16, 3, cblas, False, False) + verify_matmul_add(1, 16, 3, cblas, True, True) + verify_matmul_add(1, 16, 4, mkl) + verify_matmul_add(1, 16, 3, mkl, True, False) + verify_matmul_add(1, 16, 3, mkl, False, 
False) + verify_matmul_add(1, 16, 3, mkl, True, True) verify_matmul_add(1, 16, 4, mkldnn) verify_matmul_add(1, 16, 3, mkldnn, True, False) verify_matmul_add(1, 16, 3, mkldnn, False, False) @@ -238,5 +238,5 @@ def test_batch_matmul(): if __name__ == "__main__": test_matmul_add() - # test_quantized_matmul_add() - # test_batch_matmul() + test_quantized_matmul_add() + test_batch_matmul() diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 89b1e11d26eb..76068fa79605 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -146,11 +146,11 @@ def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=Fal # Define the neural network and compilation target. # If the target machine supports avx512 instructions, replace the # "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" -network = "mlp" +network = "resnet-50" use_sparse = False batch_size = 1 layout = "NHWC" -target = tvm.target.Target("llvm -mcpu=core-avx2 -libs=mkldnn") +target = tvm.target.Target("llvm -mcpu=core-avx2") dtype = "float32" log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) From 968c6bf208de2da295be6e97b7372eadf7450c26 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 11:47:51 +0800 Subject: [PATCH 22/26] Update --- include/tvm/relay/attrs/nn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 8b245ab529d2..3c7574562676 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -979,11 +979,11 @@ struct MatmulAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(transpose_a) .set_default(false) - .describe("Whether the input tensor is in transposed format."); + .describe("Whether the first input tensor is in transposed format."); TVM_ATTR_FIELD(transpose_b) .set_default(false) - .describe("Whether the weight tensor is in transposed format."); + .describe("Whether the second input tensor is in transposed format."); } }; From dc4681e3fa82d185651a88c41dad335032e136ee Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Tue, 29 Jun 2021 13:35:35 +0800 Subject: [PATCH 23/26] Update --- python/tvm/relay/op/nn/_nn.py | 8 +++----- python/tvm/topi/nn/dense.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 492d0c910172..056cb5694a48 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -1199,9 +1199,7 @@ def _matmul_shape_func(tensor_a_shape, tensor_b_shape, transpose_a, transpose_b) @reg.register_shape_func("nn.matmul", False) def matmul_shape_func(attrs, inputs, _): - """ - Shape function for matmul op. - """ + """Shape function for matmul op.""" ret = [ _matmul_shape_func( inputs[0], @@ -1215,8 +1213,8 @@ def matmul_shape_func(attrs, inputs, _): @reg.register_shape_func("nn.dense", False) def dense_shape_func(attrs, inputs, _): - """ - Shape function for dense op. + """Shape function for dense op. This is an alias of matmul_nt operator for data tensor in + non-transposed format and weight tensor in transposed format. 
""" ret = [ _matmul_shape_func( diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index ac1fff79deea..d8c69efb70e0 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -60,7 +60,7 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ - assert len(tensor_a.shape) == 2, "only support 2-dim dense" + assert len(tensor_a.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: @@ -100,6 +100,7 @@ def matmul( tensor_a[i, k].astype(out_dtype) * tensor_b[j, k].astype(out_dtype), axis=k ) compute_name = "T_matmul_NT" + # TODO(jcf94): Remove `dense` when `matmul` is finally ready compute_tag = "dense" else: # (transpose_a, transpose_b) == (False, False): compute_lambda = lambda i, j: te.sum( From e7a6e7639bf2d7c42e6ea6c8e297164dd37a51c2 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 10:32:36 +0800 Subject: [PATCH 24/26] Update --- python/tvm/relay/op/strategy/x86.py | 51 +++++++++++++++++++---------- python/tvm/topi/nn/dense.py | 3 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 581aa5d6a781..1271826cefa7 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -374,28 +374,12 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def matmul_strategy_cpu(attrs, inputs, out_type, target): """matmul x86 strategy""" strategy = _op.OpStrategy() - if is_auto_scheduler_enabled(): - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), - naive_schedule, - name="matmul.generic", - plevel=11, - ) - else: - logger.warning( - "Matmul is not optimized for x86. " - "Recommend to use cblas/mkl/mkldnn for better performance." - ) - strategy.add_implementation( - wrap_compute_matmul(topi.nn.matmul), - naive_schedule, - name="matmul.generic", - ) same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" if "cblas" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_cblas), @@ -403,7 +387,12 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_cblas.x86", plevel=13, ) + if length_before == len(strategy.specializations): + logger.warning( + "Currently cblas only support the data type to be float32 or float64. Skip." + ) if "mkl" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkl), @@ -411,7 +400,13 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkl.x86", plevel=14, ) + if length_before == len(strategy.specializations): + logger.warning( + "Currently mkl only support the data type to be float32, float64 or input with " + "uint8 and int8 while output wiht int32. Skip." 
+ ) if "mkldnn" in target.libs: + length_before = len(strategy.specializations) with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkldnn), @@ -419,6 +414,28 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkldnn.x86", plevel=15, ) + if length_before == len(strategy.specializations): + logger.warning("Currently mkldnn only support the data type to be float32. Skip.") + + if is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), + naive_schedule, + name="matmul.generic", + plevel=11, + ) + else: + # If no cblas/mkl/mkldnn strategy choosed + if not len(strategy.specializations): + logger.warning( + "Matmul is not optimized for x86. " + "Recommend to use cblas/mkl/mkldnn for better performance." + ) + strategy.add_implementation( + wrap_compute_matmul(topi.nn.matmul), + naive_schedule, + name="matmul.generic", + ) return strategy diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index d8c69efb70e0..58c458a7d676 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -60,6 +60,7 @@ def matmul( output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ + # TODO(jcf94): Add multi-dim support for tensor_a assert len(tensor_a.shape) == 2, "only support 2-dim matmul" if bias is not None: assert len(bias.shape) == 1 @@ -73,7 +74,7 @@ def matmul( if auto_scheduler_rewritten_layout: # Infer shape for the rewritten layout out_dim, red_dim = auto_scheduler.get_shape_from_rewritten_layout( - auto_scheduler_rewritten_layout, ["j", "k"] if transpose_b else ["k", "j"] + auto_scheduler_rewritten_layout, ["j", "k"] ) auto_scheduler.remove_index_check(tensor_b) elif transpose_b: From 63426d8376c24d77039ebe4c77783e6fddec7237 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 10:39:22 +0800 Subject: [PATCH 25/26] Lint fix --- python/tvm/relay/op/strategy/x86.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1271826cefa7..d09d90a50d41 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -379,7 +379,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" if "cblas" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_cblas), @@ -387,12 +387,13 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_cblas.x86", plevel=13, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning( "Currently cblas only support the data type to be float32 or float64. Skip." 
) if "mkl" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkl), @@ -400,13 +401,14 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkl.x86", plevel=14, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning( "Currently mkl only support the data type to be float32, float64 or input with " "uint8 and int8 while output wiht int32. Skip." ) if "mkldnn" in target.libs: - length_before = len(strategy.specializations) + length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkldnn), @@ -414,7 +416,8 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): name="matmul_mkldnn.x86", plevel=15, ) - if length_before == len(strategy.specializations): + length_after = len(strategy.specializations) if strategy.specializations else 0 + if length_before == length_after: logger.warning("Currently mkldnn only support the data type to be float32. Skip.") if is_auto_scheduler_enabled(): @@ -426,7 +429,7 @@ def matmul_strategy_cpu(attrs, inputs, out_type, target): ) else: # If no cblas/mkl/mkldnn strategy choosed - if not len(strategy.specializations): + if not strategy.specializations: logger.warning( "Matmul is not optimized for x86. " "Recommend to use cblas/mkl/mkldnn for better performance." From 9b4f77ced69267aa8a28f3715d041a7dc2087f38 Mon Sep 17 00:00:00 2001 From: "chengfan.jcf" Date: Wed, 30 Jun 2021 14:13:14 +0800 Subject: [PATCH 26/26] Bug fix for merge main --- python/tvm/relay/op/strategy/cuda.py | 6 +++--- python/tvm/topi/gpu/dense.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index c40a1f025f2c..dd265e4b4d5b 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -715,9 +715,9 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): ) # Temporary use this as a basic schedule strategy.add_implementation( - wrap_compute_matmul(topi.cuda.matmul_default_cuda), - wrap_topi_schedule(topi.cuda.schedule_matmul_default_cuda), - name="matmul_default.cuda", + wrap_compute_matmul(topi.gpu.matmul_default), + wrap_topi_schedule(topi.gpu.schedule_matmul_default), + name="matmul_default.gpu", ) if target.kind.name == "cuda" and "cublas" in target.libs: diff --git a/python/tvm/topi/gpu/dense.py b/python/tvm/topi/gpu/dense.py index 806aa9f5ca44..b9009d3f3393 100644 --- a/python/tvm/topi/gpu/dense.py +++ b/python/tvm/topi/gpu/dense.py @@ -49,6 +49,36 @@ def _callback(op): return s +@autotvm.register_topi_compute("matmul_default.gpu") +def matmul_default( + cfg, + tensor_a, + tensor_b, + bias=None, + out_dtype=None, + transpose_a=False, + transpose_b=False, +): + """Matmul operator on GPU""" + return nn.matmul(tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b) + + +@autotvm.register_topi_schedule("matmul_default.gpu") +def schedule_matmul_default(cfg, outs): + """Schedule matmul on GPU""" + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = 
te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "matmul":
+            # Temporarily use this as a basic schedule for matmul
+            # TODO(jcf94): Add a more general schedule for matmul
+            _schedule_dense_small_batch(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
 def _schedule_dense_small_batch(cfg, s, C):
     A, weights = C.op.input_tensors
     _, in_dim_weights = get_const_tuple(weights.shape)
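
For reference, a minimal end-to-end sketch (not part of any patch above) of how the `nn.matmul` op introduced by this series is meant to be called from Relay. The `transpose_a`/`transpose_b` keywords and the NT fallback to `nn.dense` follow the API added in these patches; the build and runtime calls are ordinary TVM usage, and the shapes are arbitrary illustration values:

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    dtype = "float32"
    # tensor_a is stored transposed as (K, M) and tensor_b as (K, N), so this
    # lowers to the new nn.matmul op rather than the nn.dense fallback used
    # for the (transpose_a=False, transpose_b=True) case.
    a = relay.var("a", shape=(16, 32), dtype=dtype)
    b = relay.var("b", shape=(16, 8), dtype=dtype)
    y = relay.nn.matmul(a, b, transpose_a=True, transpose_b=False)
    mod = tvm.IRModule.from_expr(relay.Function([a, b], y))

    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm")

    dev = tvm.cpu()
    rt = graph_executor.GraphModule(lib["default"](dev))
    a_np = np.random.uniform(size=(16, 32)).astype(dtype)
    b_np = np.random.uniform(size=(16, 8)).astype(dtype)
    rt.set_input("a", a_np)
    rt.set_input("b", b_np)
    rt.run()
    # Result has shape (32, 8), i.e. a_np.T @ b_np
    np.testing.assert_allclose(rt.get_output(0).asnumpy(), a_np.T @ b_np, rtol=1e-5)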