From 84a1394c2be65a3ff62098d9b32d4b5c4c5ebff9 Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Wed, 27 Feb 2019 16:47:52 -0800 Subject: [PATCH] winograd_nnpack --- include/tvm/relay/attrs/nn.h | 18 ++ nnvm/include/nnvm/top/nn.h | 20 ++ nnvm/python/nnvm/top/nn.py | 47 +++++ nnvm/src/top/nn/convolution.cc | 81 ++++++++- python/tvm/contrib/nnpack.py | 10 +- python/tvm/relay/op/nn/_nn.py | 52 ++++++ python/tvm/relay/op/nn/nn.py | 92 ++++++++++ python/tvm/relay/op/op_attrs.py | 5 + src/contrib/nnpack/convolution.cc | 28 +-- src/relay/op/nn/convolution.cc | 127 ++++++++++++- .../integration/test_winograd_nnpack.py | 127 +++++++++++++ topi/python/topi/arm_cpu/conv2d.py | 172 +++++++++++++++++- topi/python/topi/generic/nn.py | 33 ++++ topi/python/topi/nn/conv2d.py | 42 +++++ .../tests/python/test_topi_conv2d_winograd.py | 5 +- 15 files changed, 828 insertions(+), 31 deletions(-) create mode 100644 tests/python/integration/test_winograd_nnpack.py diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index 103359e3617c..2c96a0745150 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -155,6 +155,24 @@ struct Conv2DWinogradAttrs : public tvm::AttrsNode { } }; +/*! \brief Attributes used in winograd weight transformation operators */ +struct Conv2DWinogradNNPACKWeightTransformAttrs + : public tvm::AttrsNode { + int convolution_algorithm; + DataType out_dtype; + + TVM_DECLARE_ATTRS(Conv2DWinogradNNPACKWeightTransformAttrs, + "relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") { + TVM_ATTR_FIELD(convolution_algorithm) + .describe( + "The convolution algorithm for Winograd NNPACK. " + "E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, " + "tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16"); + TVM_ATTR_FIELD(out_dtype) + .set_default(NullValue()) + .describe("Output data type, set to explicit type under mixed precision setting"); + } +}; /*! 
\brief Attributes used in softmax operators */ struct SoftmaxAttrs : public tvm::AttrsNode { diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 578f928c5b9f..ed4e964383eb 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -183,6 +183,26 @@ struct WinogradWeightTransformParam : public dmlc::Parameter { + int convolution_algorithm; + int out_dtype; + + DMLC_DECLARE_PARAMETER(WinogradNNPACKWeightTransformParam) { + DMLC_DECLARE_FIELD(convolution_algorithm) + .describe( + "The convolution algorithm for Winograd NNPACK. " + "E.g. tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8 for WT_8x8, " + "tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16 for WT_8x8_FP16"); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); + } + + static const constexpr int kWeight = 0; +}; + struct WinogradConv2DParam : public dmlc::Parameter { int channels; TShape kernel_size; diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index ce2085da5a91..2510c902774b 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -161,6 +161,10 @@ def alter_conv2d_layout(attrs, inputs, tinfos): sym.contrib.conv2d_winograd_without_weight_transform sym.contrib_conv2d_winograd_weight_transform = \ sym.contrib.conv2d_winograd_weight_transform + sym.contrib_conv2d_winograd_nnpack_without_weight_transform = \ + sym.contrib.conv2d_winograd_nnpack_without_weight_transform + sym.contrib_conv2d_winograd_nnpack_weight_transform = \ + sym.contrib.conv2d_winograd_nnpack_weight_transform sym.nn = sym # map relay argument names to nnvm argument names @@ -274,6 +278,49 @@ def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, targe OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_compute("_contrib_conv2d_winograd_nnpack_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, 
inputs, _): + convolution_algorithm = attrs.get_int('convolution_algorithm') + out_dype = attrs.get_str('out_dtype') + return topi.nn.conv2d_winograd_nnpack_weight_transform( + inputs[0], convolution_algorithm, out_dype) + + +@reg.register_schedule("_contrib_conv2d_winograd_nnpack_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + +reg.register_pattern("_contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) + + +@reg.register_compute("_contrib_conv2d_winograd_nnpack_without_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, inputs, _): + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.get_str("layout") + out_dtype = attrs.get_str("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + + # pylint: disable=assignment-from-no-return + out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( + inputs[0], inputs[1], inputs[2] if attrs.get_bool("use_bias") else None, + strides, padding, dilation, layout, out_dtype) + return out + +@reg.register_schedule("_contrib_conv2d_winograd_nnpack_without_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): + with tvm.target.create(target): + return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) + +reg.register_pattern("_contrib_conv2d_winograd_nnpack_without_weight_transform", + OpPattern.OPAQUE) + + # conv2d_transpose @reg.register_compute("conv2d_transpose") def compute_conv2d_transpose(attrs, inputs, _): diff --git 
a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index e6ff72239672..601e57ab325b 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -130,13 +130,14 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs, return true; } +template inline bool WinogradConv2DInferShape(const nnvm::NodeAttrs& attrs, std::vector* in_shape, std::vector* out_shape) { static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); - const WinogradConv2DParam& param = nnvm::get(attrs.parsed); + const Param& param = nnvm::get(attrs.parsed); const Layout in_layout(param.layout); const Layout kernel_layout(param.kernel_layout); @@ -403,7 +404,7 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform) .set_attr_parser(ParamParser) .set_attr("FGetAttrDict", ParamGetAttrDict) .set_attr("FListInputNames", UseBiasListInputNames) -.set_attr("FInferShape", WinogradConv2DInferShape) +.set_attr("FInferShape", WinogradConv2DInferShape) .set_attr("FInferType", Conv2DInferType) .set_attr("FCorrectLayout", Conv2DCorrectLayout) .set_num_outputs(1) @@ -412,6 +413,82 @@ NNVM_REGISTER_OP(_contrib_conv2d_winograd_without_weight_transform) DMLC_REGISTER_PARAMETER(WinogradConv2DParam); + +inline bool Conv2DWinogradNNPACKWTInferType(const nnvm::NodeAttrs& attrs, + std::vector* in_type, + std::vector* out_type) { + const WinogradNNPACKWeightTransformParam& param = + nnvm::get(attrs.parsed); + + CHECK_EQ(in_type->size(), 1U) << "Input:[weight]"; + CHECK_EQ(out_type->size(), 1U); + + if (param.out_dtype != -1) { + NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_type, 0, param.out_dtype); + } else { + ElemwiseType<1, 1>(attrs, in_type, out_type); + } + return true; +} + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_weight_transform) +.describe(R"code(Weight transformation of winograd fast convolution algorithm. +Separate this into another nnvm symbol in order to enable Precompute Pass to compute the +weight transformation in advance. 
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) +)code" NNVM_ADD_FILELINE) +.add_argument("weight", "4D Tensor", "Weight tensor.") +.add_arguments(WinogradNNPACKWeightTransformParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const TShape &wshape = (*in_shape)[0]; + CHECK_EQ(wshape.ndim(), 4) << "Weight should be a 4 dimensional tensor"; + TShape oshape({wshape[0], wshape[1], 8, 8}); + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); + return true; +}) +.set_attr("FCorrectLayout", [](const NodeAttrs& attrs, + std::vector *ilayouts, + const std::vector *last_ilayouts, + std::vector *olayouts) { + Layout layout("OIHW"); + NNVM_ASSIGN_LAYOUT(*ilayouts, 0, layout); + NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout); + return true; +}) +.set_attr("FInferType", Conv2DWinogradNNPACKWTInferType) +.set_num_outputs(1) +.set_num_inputs(1) +.set_support_level(5); + +DMLC_REGISTER_PARAMETER(WinogradNNPACKWeightTransformParam); + +NNVM_REGISTER_OP(_contrib_conv2d_winograd_nnpack_without_weight_transform) +.describe(R"code(Compute conv2d with winograd nnpack. +- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) +- **weight**: Any shape + We do not check shape for this input tensor. 
+- **bias**: (channels,) +- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) +)code" NNVM_ADD_FILELINE) +.add_argument("data", "4D Tensor", "Input data.") +.add_argument("weight", "4D Tensor", "Transformed weight tensor.") +.add_argument("bias", "1D Tensor", "Bias parameter.") +.add_arguments(Conv2DParam::__FIELDS__()) +.set_attr_parser(ParamParser) +.set_attr("FGetAttrDict", ParamGetAttrDict) +.set_attr("FListInputNames", UseBiasListInputNames) +.set_attr("FInferShape", WinogradConv2DInferShape) +.set_attr("FInferType", Conv2DInferType) +.set_attr("FCorrectLayout", Conv2DCorrectLayout) +.set_num_outputs(1) +.set_num_inputs(UseBiasNumInputs) +.set_support_level(5); + + NNVM_REGISTER_OP(_conv2d_grad) .describe(R"code(2D convolution grad. diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py index 9fd0e7ed2cba..3aa874f3a65c 100644 --- a/python/tvm/contrib/nnpack.py +++ b/python/tvm/contrib/nnpack.py @@ -149,11 +149,12 @@ def convolution_inference_without_weight_transform( ins[1], ins[2] if bias is not None else 0, outs[0], padding[0], padding[1], padding[2], padding[3], - stride[0], stride[1], nthreads, algorithm), name="C") + stride[0], stride[1], nthreads, algorithm), name="C", dtype='float32') def convolution_inference_weight_transform( kernel, nthreads=1, - algorithm=ConvolutionAlgorithm.AUTO): + algorithm=ConvolutionAlgorithm.AUTO, + dtype='float32'): """Create an extern op to do inference convolution of 3D tensor data and 4D tensor kernel and 1D tensor bias with nnpack. 
@@ -171,13 +172,14 @@ def convolution_inference_weight_transform( """ assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16) output_channels, input_channels, _, _ = kernel.shape - transform_tile_size = 8 + if not isinstance(dtype, str): + dtype = dtype.dtype return _api.extern( (output_channels, input_channels, transform_tile_size, transform_tile_size), [kernel], lambda ins, outs: _intrin.call_packed( "tvm.contrib.nnpack.convolution_inference_weight_transform", - ins[0], outs[0], nthreads, algorithm), name="transform_kernel") + ins[0], outs[0], nthreads, algorithm), name="transform_kernel", dtype=dtype) _init_api("tvm.contrib.nnpack") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index d38f40ac373b..958aabd2d45a 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -326,6 +326,58 @@ def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target): reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) + +# winograd nnpack related operators +@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_without_weight_transform( + attrs, inputs, out_dtype, target): + """Compute definition of conv2d_winograd_nnpack_without_weight_transform""" + # pylint: disable=assignment-from-no-return + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + data_layout = attrs.get_str("data_layout") + out_dtype = attrs.get_str("out_dtype") + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + + # No bias + out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( + inputs[0], inputs[1], None, strides, padding, dilation, data_layout, + 
out_dtype) + + return [out] + +@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): + """Schedule definition of conv2d_winograd_nnpack_without_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) + +reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_without_weight_transform", + OpPattern.OPAQUE) + + +@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform") +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype, target): + """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform""" + convolution_algorithm = attrs.get_int('convolution_algorithm') + out = topi.nn.conv2d_winograd_nnpack_weight_transform( + inputs[0], convolution_algorithm, out_dtype) + return [out] + +@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform") +def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): + """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + +reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", + OpPattern.OPAQUE) + + @reg.register_compute("nn.contrib_conv2d_NCHWc") def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target): """Compute definition of conv2d NCHWc""" diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index ad8b287bb397..1a9e02a08c98 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1,3 +1,4 @@ +#pylint: disable=invalid-name, too-many-lines """Neural network operations.""" from __future__ import absolute_import as _abs from ...expr import TupleWrapper @@ -862,6 +863,72 @@ def contrib_conv2d_winograd_without_weight_transform(data, kernel_layout, out_layout, 
out_dtype) +def contrib_conv2d_winograd_nnpack_without_weight_transform(data, + weight, + strides=(1, 1), + padding=(0, 0), + dilation=(1, 1), + groups=1, + channels=None, + kernel_size=None, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + out_dtype=""): + r"""2D convolution with the NNPACK implementation of winograd algorithm. + + The basic parameters are the same as the ones in vanilla conv2d. + It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_nnpack_weight_transform + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator. + + weight : tvm.relay.Expr + The weight expressions. + + strides : tuple of int, optional + The strides of convolution. + + padding : tuple of int, optional + The padding of convolution on both sides of inputs before convolution. + + dilation : tuple of int, optional + Specifies the dilation rate to be used for dilated convolution. + + groups : int, optional + Number of groups for grouped convolution. + + channels : int, optional + Number of output channels of this convolution. + + kernel_size : tuple of int, optional + The spatial dimensions of the convolution kernel. + + data_layout : str, optional + Layout of the input. + + kernel_layout : str, optional + Layout of the weight. + + out_layout : str, optional + Layout of the output, by default, out_layout is the same as data_layout + + out_dtype : str, optional + Specifies the output data type for mixed precision conv2d. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_conv2d_winograd_nnpack_without_weight_transform( + data, weight, strides, padding, dilation, + groups, channels, kernel_size, data_layout, + kernel_layout, out_layout, out_dtype) + + def contrib_conv2d_nchwc(data, kernel, strides=(1, 1), @@ -1013,3 +1080,28 @@ def contrib_conv2d_winograd_weight_transform(weight, The computed result.
""" return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size) + + +def contrib_conv2d_winograd_nnpack_weight_transform(weight, + convolution_algorithm, + out_dtype=""): + r"""Weight Transformation part for 2D convolution with the NNPACK winograd algorithm. + + We separate this as a single op to enable pre-compute for inference. + Use this together with nn.contrib_conv2d_winograd_nnpack_without_weight_transform + + Parameters + ---------- + weight : tvm.relay.Expr + The weight expressions. + + convolution_algorithm : int + The NNPACK convolution algorithm, e.g. nnpack.ConvolutionAlgorithm.WT_8x8 or WT_8x8_FP16 + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_conv2d_winograd_nnpack_weight_transform( + weight, convolution_algorithm, out_dtype) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 5fa83bd96f30..5f9b38a56aa6 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -18,6 +18,11 @@ class Conv2DWinogradWeightTransformAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_weight_transform""" +@register_relay_attr_node +class Conv2DWinogradNNPACKWeightTransformAttrs(Attrs): + """Attribute of nn.contrib_conv2d_winograd_nnpack_weight_transform""" + + @register_relay_attr_node class GlobalPool2DAttrs(Attrs): """Attribute of nn.global_pool""" diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc index 887129819bc2..538d29333b4a 100644 --- a/src/contrib/nnpack/convolution.cc +++ b/src/contrib/nnpack/convolution.cc @@ -189,20 +189,20 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_tra CHECK(workspace_buffer != nullptr); for (auto n = 0; n < input->shape[0]; ++n) { - nnp_status status = nnp_convolution_inference( - algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels, - input_size, input_padding, kernel_size, stride_size, - static_cast(input->data) + n * input->shape[1] * -
input->shape[2] * - input->shape[3], - static_cast(transformed_kernel->data), - bias ? static_cast(bias->data) : zero_bias->data(), - static_cast(output->data) + n * output->shape[1] * - output->shape[2] * - output->shape[3], - workspace_buffer, &workspace_size, - nnp_activation_identity, nullptr, entry->threadpool, nullptr); - CHECK_EQ(status, nnp_status_success); + nnp_status status = nnp_convolution_inference( + algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels, + input_size, input_padding, kernel_size, stride_size, + static_cast(input->data) + n * input->shape[1] * + input->shape[2] * + input->shape[3], + static_cast(transformed_kernel->data), + bias ? static_cast(bias->data) : zero_bias->data(), + static_cast(output->data) + n * output->shape[1] * + output->shape[2] * + output->shape[3], + workspace_buffer, &workspace_size, + nnp_activation_identity, nullptr, entry->threadpool, nullptr); + CHECK_EQ(status, nnp_status_success); } cpu_api->FreeWorkspace(ctx, workspace_buffer); diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index b53f57464e81..1e44e97250d4 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -344,6 +344,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); +template bool Conv2DWinogradRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -354,7 +355,7 @@ bool Conv2DWinogradRel(const Array& types, static const Layout kNCHW("NCHW"); static const Layout kOIHW("OIHW"); - const Conv2DWinogradAttrs* param = attrs.as(); + const Param* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); @@ -467,7 +468,7 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") .add_argument("data", "Tensor", "The input tensor.") 
.add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) -.add_type_rel("Conv2DWinograd", Conv2DWinogradRel) +.add_type_rel("Conv2DWinograd", Conv2DWinogradRel) .set_attr("FInferCorrectLayout", Conv2DInferCorrectLayout); @@ -511,8 +512,8 @@ Expr MakeConv2DWinogradWeightTransform(Expr weight, TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_weight_transform") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeConv2DWinogradWeightTransform, args, rv); - }); + runtime::detail::unpack_call(MakeConv2DWinogradWeightTransform, args, rv); +}); RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_weight_transform") @@ -530,6 +531,124 @@ weight transformation in advance. .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); +// Positional relay function to create conv2d winograd nnpack operator +// used by frontend FFI. +Expr MakeConv2DWinogradNNPACK(Expr data, + Expr weight, + Array strides, + Array padding, + Array dilation, + int groups, + IndexExpr channels, + Array kernel_size, + std::string data_layout, + std::string kernel_layout, + std::string out_layout, + DataType out_dtype) { + auto attrs = make_node(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->groups = groups; + attrs->channels = channels; + attrs->kernel_size = std::move(kernel_size); + attrs->data_layout = std::move(data_layout); + attrs->kernel_layout = std::move(kernel_layout); + attrs->out_layout = std::move(out_layout); + attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_without_weight_transform"); + return CallNode::make(op, {data, weight}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_without_weight_transform") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeConv2DWinogradNNPACK, args, 
rv); +}); + +RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") +.describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout. + This operator assumes the weight tensor is already pre-transformed by + nn.contrib_conv2d_winograd_nnpack_weight_transform. + +- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) +- **weight**: Any shape + We do not check the shape for this input tensor. Since different backend + has different layout strategy. + +- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.Conv2DAttrs") +.set_num_inputs(2) +.add_argument("data", "Tensor", "The input tensor.") +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradNNPACKRel", Conv2DWinogradRel) +.set_attr("FInferCorrectLayout", Conv2DInferCorrectLayout); + +// relay.nn.contrib_conv2d_winograd_nnpack_weight_transform +TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); + +bool Conv2DWinogradNNPACKWeightTransformRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + return false; + } + + const Conv2DWinogradNNPACKWeightTransformAttrs* param = + attrs.as(); + CHECK(param != nullptr); + + CHECK_EQ(data->shape.size(), 4) << "Only support NCHW normal kernel layout"; + + std::vector oshape{ + data->shape[0], + data->shape[1], + 8, + 8, + }; + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + reporter->Assign(types[1], TensorTypeNode::make(Array(oshape), out_dtype)); + return true; +} + +Expr MakeConv2DWinogradNNPACKWeightTransform(Expr weight, + int convolution_algorithm, + DataType out_dtype) { + auto attrs = make_node(); + attrs->convolution_algorithm = convolution_algorithm; + 
attrs->out_dtype = std::move(out_dtype); + static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_weight_transform"); + return CallNode::make(op, {weight}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.nn._make.contrib_conv2d_winograd_nnpack_weight_transform") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeConv2DWinogradNNPACKWeightTransform, args, rv); +}); + +RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_weight_transform") +.describe(R"code(Weight transformation of winograd fast convolution algorithm with NNPACK. +Separate this into another symbol in order to enable Precompute Pass to compute the +weight transformation in advance. + +- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1]) + +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.Conv2DWinogradNNPACKWeightTransformAttrs") +.set_num_inputs(1) +.add_argument("weight", "Tensor", "The weight tensor.") +.set_support_level(10) +.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); + + // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. 
Expr MakeConv2DNCHWc(Expr data, diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py new file mode 100644 index 000000000000..62dc3771f7cf --- /dev/null +++ b/tests/python/integration/test_winograd_nnpack.py @@ -0,0 +1,127 @@ +import numpy as np +import tvm +from tvm import autotvm +from tvm.autotvm.task.space import FallbackConfigEntity +from tvm.contrib import nnpack +from tvm.contrib.pickle_memoize import memoize +import topi +import topi.testing +from topi.util import get_const_tuple +from nose import SkipTest + + +def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, + devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']): + print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + + in_height = in_width = in_size + + A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + bias = tvm.placeholder((num_filter, 1, 1), name='bias') + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") + def get_ref_data(): + a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + raise 
SkipTest("Skip because %s is not enabled" % device) + print("Running on target: %s" % device) + with tvm.target.create(device): + C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.generic.schedule_conv2d_nchw([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, b, c) + else: + func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) + func(a, w, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4) + + + for device in devices: + check_device(device) + + +class WinogradFallback(autotvm.FallbackContext): + def _query_inside(self, target, workload): + key = (target, workload) + if key in self.memory: + return self.memory[key] + cfg = FallbackConfigEntity() + cfg.template_key = 'winograd_nnpack_fp32' + self.memory[key] = cfg + return cfg + +def test_conv2d_nchw(): + if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference_without_weight_transform", True): + raise SkipTest("skip because extern function is not available") + + if not nnpack.is_available(): + raise SkipTest("skip because nnpack is not available") + + devices = ['llvm -device=arm_cpu'] + autotvm.DispatchContext.current.silent = True + with WinogradFallback(): + # resnet 18 workloads + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, devices=devices) + verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1, devices=devices) + verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1, devices=devices) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1, devices=devices) + + # unet workloads + 
verify_conv2d_nchw(1, 3, 192, 12, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 4, 192, 12, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 12, 96, 24, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 24, 48, 48, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 48, 24, 96, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 96, 12, 180, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 180, 6, 220, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 220, 6, 180, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 180, 12, 96, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 96, 24, 48, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 48, 48, 24, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 24, 96, 12, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 12, 192, 1, 3, 1, 1, add_bias=True, devices=devices) + + # relu, bias + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, devices=devices) + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, devices=devices) + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True, devices=devices) + + # weird workloads + verify_conv2d_nchw(1, 3, 3, 3, 3, 1, 1, devices=devices) + verify_conv2d_nchw(1, 13, 71, 59, 3, 1, 1, devices=devices) + + +if __name__ == "__main__": + import nose + nose.runmodule() diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index df07f95f2744..2ab623ba1c46 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name,unused-variable,no-else-return +# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument """Conv2D schedule for ARM CPU""" from __future__ import absolute_import as _abs @@ -8,11 +8,15 @@ import tvm from tvm import autotvm +import
tvm.contrib.nnpack
 
-from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
+from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform, \
+    schedule_conv2d_winograd_nnpack_without_weight_transform
 from ..util import traverse_inline, get_const_tuple, const_matrix
 from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \
-    conv2d_winograd_without_weight_transform, depthwise_conv2d_nchw
+    conv2d_winograd_without_weight_transform, \
+    conv2d_winograd_nnpack_without_weight_transform, \
+    depthwise_conv2d_nchw
 from ..nn.util import get_const_int, get_pad_tuple
 
 @autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct'])
@@ -55,7 +59,10 @@ def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dt
     return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
                               num_tile=2)
 
-@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
+
+@autotvm.register_topi_schedule(
+    schedule_conv2d_nchw, 'arm_cpu',
+    ['direct', 'winograd', 'winograd_nnpack_fp16', 'winograd_nnpack_fp32'])
 def schedule_conv2d_nchw_arm_cpu(cfg, outs):
     """TOPI schedule callback for conv2d
 
@@ -99,6 +106,10 @@ def _callback(op):
             output = op.output(0)
             _schedule_winograd(cfg, s, output, outs[0])
 
+        if 'winograd_nnpack_conv2d_output' in op.tag:
+            output = op.output(0)
+            _schedule_winograd_nnpack(cfg, s, output, outs[0])
+
     traverse_inline(s, outs[0].op, _callback)
     return s
 
@@ -499,6 +510,85 @@ def _schedule_winograd(cfg, s, output, last):
         s[output].compute_inline()
 
 
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp16'])
+def conv2d_arm_cpu_winograd_nnpack_fp16(
+        cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    """ TOPI compute callback. Use winograd_nnpack_fp16 template """
+    return conv2d_arm_cpu_winograd_nnpack(
+        cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+        tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16)
+
+
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp32'])
+def conv2d_arm_cpu_winograd_nnpack_fp32(
+        cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    """ TOPI compute callback. Use winograd_nnpack_fp32 template """
+    return conv2d_arm_cpu_winograd_nnpack(
+        cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+        tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8)
+
+
+def conv2d_arm_cpu_winograd_nnpack(
+        cfg, data, kernel, strides, padding, dilation, layout, out_dtype, convolution_algorithm):
+    """ TOPI compute callback. Use winograd NNPACK template
+
+    ``convolution_algorithm`` becomes the only candidate of the
+    'winograd_nnpack_algorithm' knob, which selects the algorithm for both the
+    weight transform and the convolution. Only NCHW, batch 1, 3x3 kernel,
+    stride 1, padding 1 and dilation 1 are accepted (see the asserts below).
+    """
+    N, CI, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+    assert (dilation_h, dilation_w) == (1, 1)
+    assert len(kernel.shape) == 4
+    CO, _, KH, KW = get_const_tuple(kernel.shape)
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+
+    assert layout == 'NCHW'
+    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
+    H = (IH + 2 * HPAD - 3) // HSTR + 1
+    W = (IW + 2 * WPAD - 3) // WSTR + 1
+
+    cfg.define_knob('winograd_nnpack_algorithm', [convolution_algorithm])
+
+    assert N == 1
+    with tvm.tag_scope("winograd_nnpack_conv2d_weight_transform"):
+        transformed_kernel = tvm.contrib.nnpack.convolution_inference_weight_transform(
+            kernel, algorithm=cfg['winograd_nnpack_algorithm'].val)
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            transformed_kernel = tvm.compute(transformed_kernel.shape, lambda *args: 0.0)
+
+    with tvm.tag_scope("winograd_nnpack_conv2d_output"):
+        output = tvm.contrib.nnpack.convolution_inference_without_weight_transform(
+            data, transformed_kernel,
+            bias=None,
+            padding=[HPAD, HPAD, WPAD, WPAD],
+            stride=[HSTR, WSTR],
+            algorithm=cfg['winograd_nnpack_algorithm'].val)
+
+    # we have to manually assign effective GFLOP for winograd
+    cfg.add_flop(2 * N * CI * H * W * KH * KW * CO)
+    return output
+
+def _schedule_winograd_nnpack(cfg, s, output, last):
+    """Schedule the NNPACK winograd op; only the tuning-time kernel transform is touched"""
+    # Could have bias.
+
+    (X, TK) = output.op.input_tensors[:2]
+
+    # transform kernel
+    assert isinstance(TK.op, (tvm.tensor.ComputeOp, tvm.tensor.ExternOp, tvm.tensor.PlaceholderOp))
+    if autotvm.GLOBAL_SCOPE.in_tuning and isinstance(TK.op, tvm.tensor.ComputeOp):
+        # kernel transformation will be pre-computed during compilation, so we skip
+        # this part to make tuning records correct
+        s[TK].pragma(s[TK].op.axis[0], 'debug_skip_region')
+
+
 ##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd'])
 def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
@@ -522,6 +612,65 @@ def _callback(op):
     return s
 
 
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD NNPACK WITHOUT WEIGHT TRANSFORM #####
+@autotvm.register_topi_compute(conv2d_winograd_nnpack_without_weight_transform,
+                               'arm_cpu',
+                               ['winograd_nnpack_fp16', 'winograd_nnpack_fp32'])
+def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides,
+                              padding, dilation, layout, out_dtype):
+    """ TOPI compute callback. Use winograd NNPACK template
+
+    ``transformed_kernel`` is expected to be already transformed. The algorithm is
+    read from the 'winograd_nnpack_algorithm' knob of ``cfg`` (the config forwarded by
+    _alter_conv2d_layout_arm) so that it matches the one used for the weight transform.
+    """
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+    assert (dilation_h, dilation_w) == (1, 1)
+    assert len(transformed_kernel.shape) == 4
+    CO, _, _, _ = get_const_tuple(transformed_kernel.shape)
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, (3, 3))
+    KH, KW = 3, 3
+
+    assert layout == 'NCHW'
+    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
+    H = (IH + 2 * HPAD - 3) // HSTR + 1
+    W = (IW + 2 * WPAD - 3) // WSTR + 1
+
+    assert N == 1
+    with tvm.tag_scope("winograd_nnpack_conv2d_output"):
+        output = tvm.contrib.nnpack.convolution_inference_without_weight_transform(
+            data=data,
+            transformed_kernel=transformed_kernel,
+            bias=bias,
+            padding=[HPAD, HPAD, WPAD, WPAD],
+            stride=[HSTR, WSTR],
+            algorithm=cfg['winograd_nnpack_algorithm'].val)
+
+    # we have to manually assign effective GFLOP for winograd
+    cfg.add_flop(2 * N * CI * H * W * KH * KW * CO)
+    return output
+
+
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_nnpack_without_weight_transform,
+                                'arm_cpu', ['winograd_nnpack_fp16', 'winograd_nnpack_fp32'])
+def schedule_conv2d_winograd_nnpack_without_weight_transform_(cfg, outs):
+    """TOPI schedule callback"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'winograd_nnpack_conv2d_output' in op.tag:
+            output = op.output(0)
+            _schedule_winograd_nnpack(cfg, s, output, outs[0])
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
 ##### REGISTER ALTER OP LAYOUT #####
 @conv2d_alter_layout.register(["arm_cpu"])
 def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
@@ -591,7 +740,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
             dispatch_ctx.update(target, new_workload, cfg)
 
             return F.nn.conv2d(*copy_inputs, **new_attrs)
-        else:  # pre-compute weight transformation in winograd
+        elif cfg.template_key == "winograd":  # pre-compute weight transformation in winograd
             if "-device=arm_cpu" in target.options:
                 tile_size = 4
                 VC = cfg['tile_k'].size[-1]
@@ -620,6 +769,31 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
             dispatch_ctx.update(target, new_workload, cfg)
 
             return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+        elif cfg.template_key in ["winograd_nnpack_fp16", "winograd_nnpack_fp32"]:
+            # pre-compute winograd_nnpack transform
+            # for winograd_nnpack_fp16, the precomputeprune pass must run on device,
+            # where float16 is supported
+            weight_dtype = 'float32'
+            transformed_kernel = F.nn.contrib_conv2d_winograd_nnpack_weight_transform(
+                copy_inputs[1],
+                convolution_algorithm=cfg['winograd_nnpack_algorithm'].val,
+                out_dtype=weight_dtype)
+            copy_inputs[1] = transformed_kernel
+            new_data = data
+            new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32")
+            bias = tvm.placeholder((CO, ), "float32")
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, bias, strides,
+                 padding, dilation, new_attrs[data_layout_key], out_dtype]
+                if len(copy_inputs) == 3 else
+                [new_data, new_kernel, strides,
+                 padding, dilation, new_attrs[data_layout_key], out_dtype],
+                conv2d_winograd_nnpack_without_weight_transform)
+            dispatch_ctx.update(target, new_workload, cfg)
+            return F.nn.contrib_conv2d_winograd_nnpack_without_weight_transform(
+                *copy_inputs, **new_attrs)
+        else:
+            raise RuntimeError("Unsupported template_key '%s'" % cfg.template_key)
     else:
         workload = autotvm.task.args_to_workload(
             [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw)
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 00b742f24e64..40c6b34e2ac0 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -121,6 +121,39 @@ def schedule_conv2d_winograd_without_weight_transform(outs):
     return _default_schedule(outs, False)
 
 
+@tvm.target.generic_func
+def schedule_conv2d_winograd_nnpack_weight_transform(outs):
+    """Schedule for weight transformation of winograd NNPACK
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of this operator
+        in the format of an array of tensors.
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    # Typically this is computed in nnvm PreCompute pass
+    s = tvm.create_schedule([x.op for x in outs])
+    return s
+
+@tvm.target.generic_func
+def schedule_conv2d_winograd_nnpack_without_weight_transform(outs):
+    """Schedule for winograd NNPACK without weight transformation
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of this operator
+        in the format of an array of tensors.
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_conv2d_transpose_nchw(outs):
     """Schedule for conv2d_transpose_nchw
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 559f132f19c2..a67f608d26dc 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -410,6 +410,63 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding, di
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
 
 
+def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype):
+    """Weight transformation for winograd NNPACK
+
+    Parameters
+    ----------
+    kernel: Tensor
+        The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now.
+    convolution_algorithm: int
+        The convolution algorithm for Winograd NNPACK.
+    out_dtype: str
+        Data type of the transformed kernel, forwarded to NNPACK as ``dtype``.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [CO, CI, 8, 8], the layout expected as ``filter`` by
+        conv2d_winograd_nnpack_without_weight_transform
+    """
+    from tvm.contrib import nnpack
+    return nnpack.convolution_inference_weight_transform(
+        kernel, algorithm=convolution_algorithm, dtype=out_dtype)
+
+@tvm.target.generic_func
+def conv2d_winograd_nnpack_without_weight_transform(
+        input, filter, bias, strides, padding, dilation, layout, out_dtype):
+    """Compute convolution in winograd algorithm with NNPACK. The filter is supposed to be
+    transformed in advance.
+
+    Parameters
+    ----------
+    input : tvm.Tensor
+        4-D; the arm_cpu implementation only accepts layout 'NCHW',
+        i.e. [batch, in_channel, in_height, in_width]
+    filter : tvm.Tensor
+        4-D with shape [num_filter, in_channel, 8, 8]
+    bias : tvm.Tensor
+        1-D with shape [num_filter]
+    strides : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+    dilation : int or a list/tuple of two ints
+        Dilation size, or [dilation_height, dilation_width]
+    layout : str
+        Layout of the input data
+    out_dtype : str
+        Output data type
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D convolution output; presumably laid out according to ``layout`` (TODO confirm)
+    """
+    raise ValueError(
+        "missing register for topi.nn.conv2d_winograd_nnpack_without_weight_transform")
+
+
 @tvm.target.generic_func
 def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
     """Group convolution operator in NCHW layout.
diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py
index 1ca7240a41b0..a76c9c62e008 100644
--- a/topi/tests/python/test_topi_conv2d_winograd.py
+++ b/topi/tests/python/test_topi_conv2d_winograd.py
@@ -10,7 +10,9 @@
 from topi.util import get_const_tuple
 
 
-def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False,
+        devices=('cuda', 'llvm -device=arm_cpu', 'opencl -device=mali')):
+    """Run check_device on every target in ``devices`` for the given conv2d workload"""
     print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
 
     in_height = in_width = in_size
@@ -67,7 +69,7 @@ def check_device(device):
         tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
 
-    for device in ['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']:
+    for device in devices:
         check_device(device)
 
 