From 42a8989863e2bf8619475626bfdb82b1a9cf9bfb Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 17 Oct 2019 16:00:52 -0700 Subject: [PATCH 01/48] relay op strategy fix lint bitpack strategy bitserial_dense (#6) * update strategy * address comments fix a few topi test Dense strategy (#5) * dense * add biforst; remove comments * address comment Refactor x86 conv2d_NCHWc (#4) * Refactor x86 conv2d * Add x86 depthwise_conv2d_NCHWc * Add back topi x86 conv2d_nchw * Merge x86 conv2d_nchw and conv2d_NCHWc * Minor fix for x86 conv2d fix more strategy Add x86 conv2d_NCHWc_int8 strategy (#8) * Add x86 conv2d_NCHWc_int8 strategy * Remove contrib_conv2d_nchwc_int8 * Fix generic conv2d_NCHWc for int8 * Fix topi arm_cpu conv2d_NCHWc_int8 update x86 conv2d enable specify relay ops to be tuned for autotvm add cuda conv2d strategy add conv2d strategy for rocm add conv2d strategy for hls add conv2d strategy for arm cpu add conv2d strategy for mali add conv2d strategy for bifrost add conv2d strategy for intel graphics clean up and fix lint remove template keys from autotvm remove 2 in the func name address comments fix --- include/tvm/relay/op_attr_types.h | 191 ++++- include/tvm/te/schedule.h | 55 ++ python/tvm/__init__.py | 2 +- python/tvm/autotvm/__init__.py | 4 +- .../autotvm/graph_tuner/base_graph_tuner.py | 33 +- .../graph_tuner/utils/traverse_graph.py | 40 +- python/tvm/autotvm/record.py | 3 + python/tvm/autotvm/task/__init__.py | 7 +- python/tvm/autotvm/task/dispatcher.py | 87 +- python/tvm/autotvm/task/relay_integration.py | 63 +- python/tvm/autotvm/task/space.py | 18 +- python/tvm/autotvm/task/task.py | 334 +++++--- python/tvm/autotvm/task/topi_integration.py | 408 +++------ python/tvm/autotvm/tophub.py | 9 +- .../tvm/autotvm/tuner/xgboost_cost_model.py | 3 +- python/tvm/relay/backend/compile_engine.py | 330 ++++++++ python/tvm/relay/expr.py | 6 + python/tvm/relay/expr_functor.py | 20 +- python/tvm/relay/memory_alloc.py | 4 +- python/tvm/relay/op/__init__.py | 5 +- python/tvm/relay/op/_algorithm.py | 48 +- python/tvm/relay/op/_reduce.py | 32 +- python/tvm/relay/op/_tensor.py | 119 ++- python/tvm/relay/op/_transform.py | 121 ++- python/tvm/relay/op/annotation/annotation.py | 4 +- python/tvm/relay/op/contrib/_contrib.py | 20 +- python/tvm/relay/op/image/_image.py | 13 +- python/tvm/relay/op/nn/_nn.py | 782 +++--------------- python/tvm/relay/op/nn/nn.py | 145 +--- python/tvm/relay/op/op.py | 94 ++- python/tvm/relay/op/strategy/__init__.py | 30 + python/tvm/relay/op/strategy/arm_cpu.py | 203 +++++ python/tvm/relay/op/strategy/bifrost.py | 97 +++ python/tvm/relay/op/strategy/cuda.py | 352 ++++++++ python/tvm/relay/op/strategy/generic.py | 678 +++++++++++++++ python/tvm/relay/op/strategy/hls.py | 151 ++++ .../tvm/relay/op/strategy/intel_graphics.py | 72 ++ python/tvm/relay/op/strategy/mali.py | 94 +++ python/tvm/relay/op/strategy/opengl.py | 73 ++ python/tvm/relay/op/strategy/rocm.py | 128 +++ python/tvm/relay/op/strategy/x86.py | 277 +++++++ python/tvm/relay/op/vision/_rcnn.py | 56 +- python/tvm/relay/op/vision/_vision.py | 77 +- python/tvm/relay/op/vision/_yolo.py | 6 +- python/tvm/relay/quantize/_annotate.py | 5 +- python/tvm/te/schedule.py | 34 + python/tvm/tir/expr.py | 8 + src/relay/backend/compile_engine.cc | 53 +- src/relay/backend/compile_engine.h | 12 + src/relay/ir/expr.cc | 6 + src/relay/ir/op_attr_types.cc | 110 +++ src/relay/op/annotation/annotation.cc | 14 +- src/relay/op/debug.cc | 5 +- src/relay/op/memory/memory.cc | 10 +- src/relay/op/nn/convolution.cc | 101 --- src/relay/op/nn/nn.cc | 23 +- src/relay/op/nn/pad.cc | 5 +- src/relay/op/nn/pooling.cc | 30 +- src/relay/op/tensor/binary.cc | 5 +- src/relay/op/tensor/reduce.cc | 71 +- src/relay/op/tensor/transform.cc | 135 ++- src/relay/op/tensor/unary.cc | 15 +- src/relay/op/vision/yolo.cc | 3 +- src/relay/pass/alter_op_layout.cc | 5 +- src/te/schedule/schedule_lang.cc | 73 +- .../relay/test_autotvm_task_extraction.py | 31 +- tests/python/relay/test_op_level2.py | 60 +- .../python/unittest/test_graph_tuner_core.py | 26 +- .../python/unittest/test_graph_tuner_utils.py | 4 +- topi/include/topi/cuda/normalization.h | 5 +- topi/include/topi/rocm/normalization.h | 7 +- topi/python/topi/__init__.py | 1 + topi/python/topi/argwhere.py | 2 - topi/python/topi/arm_cpu/__init__.py | 15 +- topi/python/topi/arm_cpu/bitserial_conv2d.py | 9 +- topi/python/topi/arm_cpu/bitserial_dense.py | 10 +- topi/python/topi/arm_cpu/conv2d.py | 447 ++-------- topi/python/topi/arm_cpu/conv2d_alter_op.py | 167 ++++ topi/python/topi/arm_cpu/conv2d_int8.py | 17 +- .../topi/arm_cpu/conv2d_spatial_pack.py | 6 +- topi/python/topi/arm_cpu/conv2d_transpose.py | 11 +- topi/python/topi/arm_cpu/depthwise_conv2d.py | 69 +- topi/python/topi/arm_cpu/injective.py | 4 - topi/python/topi/bifrost/conv2d.py | 141 ++-- topi/python/topi/bifrost/dense.py | 13 +- topi/python/topi/bifrost/depthwise_conv2d.py | 2 - topi/python/topi/cuda/__init__.py | 28 +- topi/python/topi/cuda/batch_matmul.py | 49 +- topi/python/topi/cuda/conv1d.py | 81 +- topi/python/topi/cuda/conv1d_transpose_ncw.py | 11 +- topi/python/topi/cuda/conv2d.py | 228 ++--- topi/python/topi/cuda/conv2d_alter_op.py | 134 +++ topi/python/topi/cuda/conv2d_direct.py | 2 +- topi/python/topi/cuda/conv2d_hwcn.py | 8 +- topi/python/topi/cuda/conv2d_int8.py | 18 +- .../python/topi/cuda/conv2d_transpose_nchw.py | 11 +- topi/python/topi/cuda/conv2d_winograd.py | 178 +--- topi/python/topi/cuda/conv3d.py | 211 +++-- topi/python/topi/cuda/conv3d_direct.py | 11 +- topi/python/topi/cuda/deformable_conv2d.py | 18 +- topi/python/topi/cuda/dense.py | 136 ++- topi/python/topi/cuda/depthwise_conv2d.py | 14 +- topi/python/topi/cuda/group_conv2d_nchw.py | 355 ++++---- topi/python/topi/cuda/injective.py | 7 +- topi/python/topi/cuda/nms.py | 13 +- topi/python/topi/cuda/nn.py | 6 +- topi/python/topi/cuda/pooling.py | 7 +- topi/python/topi/cuda/rcnn/__init__.py | 2 +- topi/python/topi/cuda/rcnn/proposal.py | 5 +- topi/python/topi/cuda/reduction.py | 2 - topi/python/topi/cuda/softmax.py | 3 +- topi/python/topi/cuda/sort.py | 14 +- topi/python/topi/cuda/ssd/multibox.py | 18 +- topi/python/topi/cuda/vision.py | 11 +- topi/python/topi/generic/conv2d.py | 82 +- topi/python/topi/generic/extern.py | 1 - topi/python/topi/generic/injective.py | 21 +- topi/python/topi/generic/nn.py | 73 +- topi/python/topi/generic/search.py | 2 - topi/python/topi/generic/sort.py | 2 - topi/python/topi/generic/vision.py | 9 - topi/python/topi/hls/injective.py | 3 - topi/python/topi/hls/nn.py | 14 - topi/python/topi/intel_graphics/__init__.py | 2 + topi/python/topi/intel_graphics/conv2d.py | 421 ++++------ .../topi/intel_graphics/conv2d_alter_op.py | 102 +++ .../topi/intel_graphics/depthwise_conv2d.py | 17 +- topi/python/topi/mali/conv2d.py | 152 ++-- topi/python/topi/mali/dense.py | 40 +- topi/python/topi/mali/depthwise_conv2d.py | 15 +- topi/python/topi/nn/batch_matmul.py | 22 +- topi/python/topi/nn/bitserial_conv2d.py | 221 +---- topi/python/topi/nn/bitserial_dense.py | 79 +- topi/python/topi/nn/conv1d.py | 15 +- topi/python/topi/nn/conv1d_transpose.py | 1 - topi/python/topi/nn/conv2d.py | 188 +---- topi/python/topi/nn/conv2d_transpose.py | 1 - topi/python/topi/nn/conv3d.py | 46 +- topi/python/topi/nn/deformable_conv2d.py | 1 - topi/python/topi/nn/dense.py | 28 +- topi/python/topi/nn/depthwise_conv2d.py | 3 - topi/python/topi/nn/local_response_norm.py | 2 - topi/python/topi/nn/sparse.py | 8 +- topi/python/topi/nn/util.py | 2 +- topi/python/topi/opengl/conv2d_nchw.py | 2 - topi/python/topi/opengl/dense.py | 2 - topi/python/topi/opengl/injective.py | 3 - topi/python/topi/opengl/pooling.py | 3 - topi/python/topi/opengl/softmax.py | 2 - topi/python/topi/rocm/conv2d.py | 74 +- topi/python/topi/rocm/dense.py | 101 ++- topi/python/topi/rocm/nn.py | 7 +- topi/python/topi/sort.py | 2 - topi/python/topi/vision/nms.py | 3 +- topi/python/topi/vision/rcnn/proposal.py | 2 +- topi/python/topi/vision/rcnn/roi_align.py | 1 - topi/python/topi/vision/rcnn/roi_pool.py | 1 - topi/python/topi/vision/reorg.py | 2 - topi/python/topi/vision/ssd/multibox.py | 3 - topi/python/topi/x86/__init__.py | 18 +- topi/python/topi/x86/batch_matmul.py | 53 +- topi/python/topi/x86/bitserial_conv2d.py | 235 +++++- topi/python/topi/x86/bitserial_dense.py | 80 +- topi/python/topi/x86/conv1d.py | 4 +- topi/python/topi/x86/conv2d.py | 401 +++------ topi/python/topi/x86/conv2d_alter_op.py | 226 +++-- topi/python/topi/x86/conv2d_avx_1x1.py | 150 ++-- topi/python/topi/x86/conv2d_avx_common.py | 147 ++-- topi/python/topi/x86/conv2d_int8.py | 218 +++-- topi/python/topi/x86/conv2d_transpose.py | 49 +- topi/python/topi/x86/conv3d.py | 24 +- topi/python/topi/x86/dense.py | 237 +++--- topi/python/topi/x86/depthwise_conv2d.py | 139 ++-- topi/python/topi/x86/injective.py | 4 - topi/python/topi/x86/nn.py | 1 - topi/python/topi/x86/pooling.py | 2 - topi/python/topi/x86/reduction.py | 5 +- topi/python/topi/x86/roi_align.py | 4 +- topi/python/topi/x86/sparse.py | 4 +- topi/src/topi.cc | 4 +- topi/tests/python/common.py | 38 +- topi/tests/python/test_fifo_buffer.py | 10 +- topi/tests/python/test_topi_broadcast.py | 10 +- topi/tests/python/test_topi_clip.py | 4 +- topi/tests/python/test_topi_depth_to_space.py | 4 +- topi/tests/python/test_topi_image.py | 6 +- topi/tests/python/test_topi_math.py | 32 +- topi/tests/python/test_topi_reduce.py | 4 +- topi/tests/python/test_topi_relu.py | 5 +- topi/tests/python/test_topi_space_to_depth.py | 4 +- topi/tests/python/test_topi_transform.py | 64 +- topi/tests/python/test_topi_upsampling.py | 6 +- tutorials/autotvm/tune_relay_arm.py | 2 +- tutorials/autotvm/tune_relay_cuda.py | 3 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 3 +- tutorials/autotvm/tune_relay_x86.py | 19 +- vta/scripts/tune_resnet.py | 2 +- vta/tutorials/autotvm/tune_relay_vta.py | 5 +- 198 files changed, 6922 insertions(+), 5730 deletions(-) create mode 100644 python/tvm/relay/op/strategy/__init__.py create mode 100644 python/tvm/relay/op/strategy/arm_cpu.py create mode 100644 python/tvm/relay/op/strategy/bifrost.py create mode 100644 python/tvm/relay/op/strategy/cuda.py create mode 100644 python/tvm/relay/op/strategy/generic.py create mode 100644 python/tvm/relay/op/strategy/hls.py create mode 100644 python/tvm/relay/op/strategy/intel_graphics.py create mode 100644 python/tvm/relay/op/strategy/mali.py create mode 100644 python/tvm/relay/op/strategy/opengl.py create mode 100644 python/tvm/relay/op/strategy/rocm.py create mode 100644 python/tvm/relay/op/strategy/x86.py create mode 100644 src/relay/ir/op_attr_types.cc create mode 100644 topi/python/topi/arm_cpu/conv2d_alter_op.py create mode 100644 topi/python/topi/cuda/conv2d_alter_op.py create mode 100644 topi/python/topi/intel_graphics/conv2d_alter_op.py diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 88e948f5d72a..889895472168 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -105,9 +106,8 @@ using TShapeDataDependant = bool; */ using FTVMCompute = runtime::TypedPackedFunc< Array(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target)>; + const Array& inputs, + const Type& out_type)>; /*! * \brief Build the computation schedule for @@ -123,6 +123,16 @@ using FTVMSchedule = runtime::TypedPackedFunc< const Array& outs, const Target& target)>; +/*! + * \brief Generate the strategy of operators. This function is a generic + * function and can be re-defined for different targets. + * + * The function signature of generic function is: + * OpStrategy(const Attrs& attrs, const Array& inputs, + * const Type& out_type, const Target& target) + */ +using FTVMStrategy = GenericFunc; + /*! * \brief Alternate the layout of operators or replace the * operator with other expressions. This function will be invoked @@ -136,7 +146,8 @@ using FTVMSchedule = runtime::TypedPackedFunc< using FTVMAlterOpLayout = runtime::TypedPackedFunc< Expr(const Attrs& attrs, const Array& args, - const Array& tinfos)>; + const Array& tinfos, + const Type& out_type)>; /*! * \brief Convert the layout of operators or replace the @@ -191,9 +202,7 @@ using FForwardRewrite = runtime::TypedPackedFunc< * \brief Gradient for a specific op. * * \param orig_call the original Expr. - * * \param output_grad the gradient of the Expr. - * * \return the gradient for each parameters. */ using FPrimalGradient = runtime::TypedPackedFunc(const Expr& orig_call, @@ -207,7 +216,7 @@ enum AnyCodegenStrategy { kVariableDimensions }; -/* \brief A runtime representation of shape. */ +/*! \brief A runtime representation of shape. */ using Shape = Array; using FShapeFunc = runtime::TypedPackedFunc< @@ -215,6 +224,174 @@ using FShapeFunc = runtime::TypedPackedFunc< const Array& inputs, const Array& out_ndims)>; +/*! + * \brief Operator implementation in TVM. + */ +class OpImplementNode : public Object { + public: + /*! \brief Compute function */ + FTVMCompute fcompute; + /*! \brief Schedule function */ + FTVMSchedule fschedule; + /*! \brief Priority level */ + Integer plevel; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("plevel", &plevel); + } + + static constexpr const char* _type_key = "relay.OpImplement"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementNode, Object); +}; + +/*! + * \brief Operator implementation class. + */ +class OpImplement : public ObjectRef { + public: + /*! \brief default constructor */ + OpImplement() {} + /*! \brief constructor from node pointer */ + explicit OpImplement(ObjectPtr n) : ObjectRef(n) {} + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const OpImplementNode* operator->() const; + /*! + * \brief Invoke the operator compute function. + * \param attrs The attribute of the primitive + * \param inputs The input tensors. + * \param out_type The output type information. + * \return The output compute description of the operator. + */ + Array Compute(const Attrs& attrs, + const Array& inputs, + const Type& out_type); + /*! + * \brief Build the computation schedule. + * \param attrs The attribute of the node. + * \param outs The output tensors. + * \param target The build target. + * \return The computation schedule. + */ + te::Schedule Schedule(const Attrs& attrs, + const Array& outs, + const Target& target); +}; + +/*! + * \brief Specialized implementations for operators under certain conditions. + */ +class OpSpecializationNode : public Object { + public: + /*! \brief List of implementations. */ + Array implements; + /*! \brief Condition to enable the specialization. + * Could be undefined to represent generic case. */ + te::SpecializedCondition condition; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("condition", &condition); + v->Visit("implements", &implements); + } + + static constexpr const char* _type_key = "relay.OpSpecialization"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpSpecializationNode, ExprNode); +}; + +/*! + * \brief Operator specialization class. + */ +class OpSpecialization : public ObjectRef { + public: + OpSpecialization() {} + explicit OpSpecialization(ObjectPtr n) : ObjectRef(n) {} + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const OpSpecializationNode* operator->() const; + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline OpSpecializationNode* operator->(); + /*! + * \brief Add an implementation. + * \param compute Compute function + * \param schedule Schedule function + * \param plevel Priority level of this implemntation. + */ + void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + int plevel); +}; + +/*! + * \brief Operator strategy to choose implementation. + */ +class OpStrategyNode : public Object { + public: + /*! \brief List of operator specializations. */ + Array specializations; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("specializations", &specializations); + } + + static constexpr const char* _type_key = "relay.OpStrategy"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpStrategyNode, ExprNode); +}; + +/*! + * \brief Operator strategy class. + */ +class OpStrategy : public ObjectRef { + public: + /*! \brief default constructor */ + OpStrategy() {} + /*! \brief constructor from node pointer */ + explicit OpStrategy(ObjectPtr n) : ObjectRef(n) {} + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const OpStrategyNode* operator->() const; + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline OpStrategyNode* operator->(); + /*! + * \brief Add an implementation. + * \param compute Compute function + * \param schedule Schedule function + * \param plevel Priority level of this implementation. + */ + void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, int plevel); +}; + +// implementations +inline const OpImplementNode* OpImplement::operator->() const { + return static_cast(get()); +} + +inline const OpSpecializationNode* OpSpecialization::operator->() const { + return static_cast(get()); +} + +inline OpSpecializationNode* OpSpecialization::operator->() { + return static_cast(get_mutable()); +} + +inline const OpStrategyNode* OpStrategy::operator->() const { + return static_cast(get()); +} + +inline OpStrategyNode* OpStrategy::operator->() { + return static_cast(get_mutable()); +} + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_ATTR_TYPES_H_ diff --git a/include/tvm/te/schedule.h b/include/tvm/te/schedule.h index e99b54a86565..2a88f4c8f7e9 100644 --- a/include/tvm/te/schedule.h +++ b/include/tvm/te/schedule.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -742,6 +743,55 @@ class SingletonNode : public IterVarRelationNode { TVM_DECLARE_FINAL_OBJECT_INFO(SingletonNode, IterVarRelationNode); }; +class SpecializedConditionNode; + +/*! + * \brief Specialized condition to enable op specialization + */ +class SpecializedCondition : public ObjectRef { + public: + SpecializedCondition() {} + explicit SpecializedCondition(ObjectPtr n) : ObjectRef(n) {} + /*! + * \brief Get the current specialized condition. + * \return The current specialized condition. + */ + TVM_DLL static SpecializedCondition Current(); + + const SpecializedConditionNode* operator->() const; + + using ContainerType = SpecializedConditionNode; + class Internal; + private: + // enable with syntax. + friend class Internal; + friend class With; + /*! \brief Push a new specialized condition onto the thread local stack. */ + TVM_DLL void EnterWithScope(); + /*! \brief Pop a specialized condition off the thread local context stack. */ + TVM_DLL void ExitWithScope(); +}; + +/*! \brief Container for specialization conditions. */ +class SpecializedConditionNode : public Object { + public: + /*! + * \brief List of conditions in conjunctive joint form (CNF). + * Each condition should be a simple expression, e.g., n > 16, m % 8 == 0, etc., + * where n, m are tvm::Var that represents a dimension in the tensor shape. + */ + Array clauses; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("clauses", &clauses); + } + + static SpecializedCondition make(Array conditions); + + static constexpr const char* _type_key = "SpecializedCondition"; + TVM_DECLARE_FINAL_OBJECT_INFO(SpecializedConditionNode, Object); +}; + // implementations inline const StageNode* Stage::operator->() const { @@ -765,6 +815,11 @@ inline const IterVarRelationNode* IterVarRelation::operator->() const { inline const IterVarAttrNode* IterVarAttr::operator->() const { return static_cast(get()); } + +inline const SpecializedConditionNode* SpecializedCondition::operator->() const { + return static_cast(get()); +} + } // namespace te } // namespace tvm #endif // TVM_TE_SCHEDULE_H_ diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 65cb67266de6..c1b80b887ebf 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -50,7 +50,7 @@ from .target import build_config # tvm.te -from .te import decl_tensor_intrin, create_schedule, tag_scope +from .te import decl_tensor_intrin, create_schedule, tag_scope, current_specialization # tvm.testing from . import testing diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index cf8362ad8368..eab4ddfeaf7d 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -41,8 +41,8 @@ from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, \ LocalBuilder, LocalRunner, RPCRunner from .tuner import callback -from .task import template, get_config, create, ConfigSpace, ConfigEntity, \ - register_topi_compute, register_topi_schedule, \ +from .task import get_config, create, ConfigSpace, ConfigEntity, \ + register_topi_compute, register_topi_schedule, register_customized_task, \ DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best, \ ApplyGraphBest as apply_graph_best from .env import GLOBAL_SCOPE diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index b02c289cb10f..489a97f10d5d 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -25,7 +25,7 @@ import tvm from tvm import autotvm, relay from tvm.autotvm.task import get_config -from tvm.autotvm.task.topi_integration import deserialize_args, serialize_args +from tvm.autotvm.task.topi_integration import serialize_args from tvm.autotvm.record import encode, load_from_file from tvm.autotvm.measure import MeasureResult, MeasureInput @@ -35,18 +35,17 @@ from ._base import INVALID_LAYOUT_TIME -# Setup topi_op_name -> layout function -# NOTE: To add more ops, change the following dictionary. -OP2LAYOUT = { - "topi_nn_conv2d": topi.nn.conv2d_infer_layout, - "topi_nn_depthwise_conv2d_nchw": topi.nn.depthwise_conv2d_infer_layout, -} +def get_infer_layout(task_name): + if task_name.startswith("conv2d"): + return topi.nn.conv2d_infer_layout + elif task_name.startswith("depthwise_conv2d"): + return topi.nn.depthwise_conv2d_infer_layout + else: + raise ValueError("Cannot find infer layout for task %s" % task_name) - -@autotvm.template +@autotvm.register_customized_task("layout_transform") def layout_transform(*args): """Autotvm layout transform template.""" - args = deserialize_args(args) cfg = get_config() cfg.add_flop(-1) data = args[0] @@ -82,7 +81,7 @@ def __init__(self, graph, input_shapes, records, target_ops, Each row of this file is an encoded record pair. Otherwise, it is an iterator. - target_ops : List of str + target_ops : List of relay.op.Op Target tuning operators. target : str or tvm.target @@ -104,7 +103,7 @@ def __init__(self, graph, input_shapes, records, target_ops, self._layout_transform_perf_records = {} self._layout_transform_interlayer_cost = {} self._input_shapes = input_shapes - self._target_ops = [op.__name__ for op in target_ops] + self._target_ops = target_ops self._name = name self._max_sch_num = max_sch_num @@ -212,7 +211,7 @@ def _fetch_cfg(self): node_entry["record_candidates"] = cache_dict[workload] continue record_candidates = [] - infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) layout_tracking_dict = {} for record in cfg_dict[workload]: in_measure, out_measure = record @@ -264,7 +263,7 @@ def _iterate_layout_transform(self, callback): if node_entry["op"] in self._target_ops: o_idx = key - o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) o_wkl = node_entry["workloads"][0] i_topi_op = in_node_entry["topi_op"][0] i_wkl = in_node_entry["workloads"][0] @@ -273,14 +272,14 @@ def _iterate_layout_transform(self, callback): pivot += 1 i_topi_op = in_node_entry["topi_op"][pivot] i_wkl = in_node_entry["workloads"][pivot] - i_infer_layout_func = OP2LAYOUT[i_topi_op] + i_infer_layout_func = get_infer_layout(i_topi_op) else: o_idx = target_input_idx if i <= target_input_pos: continue - o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) o_wkl = node_entry["workloads"][target_input_pos] - i_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][i]] + i_infer_layout_func = get_infer_layout(node_entry["topi_op"][i]) i_wkl = node_entry["workloads"][i] if (i_idx, o_idx) in pair_tracker: diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 7648322d3b18..5c598b5b1260 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -18,8 +18,6 @@ """API for graph traversing.""" import threading -import topi - import tvm from tvm import relay, autotvm from tvm.relay import transform @@ -30,13 +28,6 @@ from .utils import has_multiple_inputs, is_boundary_node, is_skipped_node -# Setup relay op base name -> topi compute functions -# NOTE: To add more ops, change the following dictionary. -OP2COMPUTE = { - "conv2d" : [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw], -} - - def expr2graph(expr, target_ops, node_dict, node_list): """Convert relay expr to graph data structure and fetch workloads of target operators. @@ -46,8 +37,8 @@ def expr2graph(expr, target_ops, node_dict, node_list): expr : tvm.relay.Expr.Function Input relay function expression. - target_ops: List of str - List of target relay base op name + target_ops: List of relay.op.Op + List of target relay ops node_dict : dictionary from tvm.relay.Expr to int Dictionary to record node index @@ -59,14 +50,10 @@ def expr2graph(expr, target_ops, node_dict, node_list): "name": str, "workloads": [tuple], "topi_op": [function]} """ env = TaskExtractEnv.get(allow_duplicate=True) - topi_funcs = [] - for op_name in target_ops: - if op_name not in OP2COMPUTE: - raise RuntimeError("Not supported relay op in graph tuner: %s" - % op_name) - topi_funcs += OP2COMPUTE[op_name] - env.reset(topi_funcs) - # pylint: disable=not-context-manager + env.reset(target_ops) + # TODO(@kevinthesun, @icemelon9): Currently graph tuning pass relies on the fact + # that # autotvm tasks == # ops. But this won't be true after having relay op + # strategy. We need to find a solution to fix this. with env: _expr2graph_impl(expr, target_ops, node_dict, node_list) task_pos = 0 @@ -75,8 +62,7 @@ def expr2graph(expr, target_ops, node_dict, node_list): task_name, args = env.task_collection[task_pos] task = autotvm.task.create(task_name, args, target="llvm", - target_host=None, - template_key='direct') + target_host=None) node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 @@ -101,8 +87,8 @@ def _traverse_expr(node): "op": "null", "name": None} if isinstance(node, Call): - op_name = node.op.name.split(".")[-1] - node_entry["op"] = op_name + op = node.op + node_entry["op"] = node.op for arg in node.args: in_node_idx = node_dict[arg] if isinstance(arg, (Tuple, TupleGetItem)): @@ -118,12 +104,12 @@ def _traverse_expr(node): node_entry["types"].append(tupe_type) else: raise RuntimeError("Unsupported output type %s in operator %s" - % (type(out_type), op_name)) + % (type(out_type), op.name)) # Utilize tracing target to fetch workload with topo-order. # Since we only need workload, dummy target can be used to # create task. - if op_name in target_ops: + if op in target_ops: params = [] for i, input_idx in enumerate(node_entry["inputs"]): input_node_entry = node_list[input_idx[0]] @@ -133,7 +119,7 @@ def _traverse_expr(node): "operators with input node of type " "relay.expr.Var/Constant/Call. Now " "find a target op %s with input type %s" - % (op_name, str(type(input_node_entry["node"])))) + % (op, str(type(input_node_entry["node"])))) free_var = relay.Var("var_%d" % i, input_type) params.append(free_var) call = relay.Call(node.op, params, node.attrs) @@ -155,11 +141,9 @@ def _traverse_expr(node): _expr2graph_impl(node, target_ops, node_dict, node_list) return elif isinstance(node, TupleGetItem): - node_entry["op"] = "TupleGetItem" in_node_idx = node_dict[node.tuple_value] node_entry["inputs"].append([in_node_idx, node.index, 0]) elif isinstance(node, Tuple): - node_entry["op"] = "Tuple" for tuple_item in node: in_node_idx = node_dict[tuple_item] if isinstance(tuple_item, TupleGetItem): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index fbf4a08f7b0c..2ea288ed3426 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -28,6 +28,7 @@ import os import itertools from collections import OrderedDict +import numpy as np from .. import build, lower, target as _target @@ -152,6 +153,7 @@ def clean_json_to_python(x): config = ConfigEntity.from_json_dict(config) inp = MeasureInput(tgt, tsk, config) result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]]) + config.cost = np.mean(result.costs) return inp, result if protocol == 'pickle': @@ -160,6 +162,7 @@ def clean_json_to_python(x): task_tuple = pickle.loads(base64.b64decode(items[1].encode())) config = pickle.loads(base64.b64decode(items[2].encode())) result = pickle.loads(base64.b64decode(items[3].encode())) + config.cost = np.mean(result.costs) tsk = task.Task(task_tuple[0], task_tuple[1]) tsk.workload = task_tuple[3] diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index f249f6bacb90..29313d4b5491 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -22,12 +22,13 @@ of typical tasks of interest. """ -from .task import Task, create, register, template, get_config, args_to_workload +from .task import Task, create, get_config, args_to_workload, \ + register_customized_task from .space import ConfigSpace, ConfigEntity from .code_hash import attach_code_hash, attach_code_hash_to_arg -from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \ +from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, \ FallbackContext, clear_fallback_cache, ApplyGraphBest from .topi_integration import register_topi_compute, register_topi_schedule, \ - TaskExtractEnv + TaskExtractEnv, get_workload from .relay_integration import extract_from_program, extract_from_multiple_program diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index e7022fad2081..75466bb50e9c 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -152,79 +152,6 @@ def __exit__(self, ptype, value, trace): DispatchContext.current = self._old_ctx -def dispatcher(fworkload): - """Wrap a workload dispatcher function. - - Parameters - ---------- - fworkload : function - The workload extraction function from arguments. - - Returns - ------- - fdispatcher : function - A wrapped dispatcher function, which will - dispatch based on DispatchContext and - the current workload. - """ - dispatch_dict = {} - func_name = fworkload.__name__ - - def register(key, func=None, override=False): - """Register template function. - - Parameters - ---------- - key : str or List of str - The template key to identify the template - under this dispatcher. - func : function - The function to be registered. - The first argument of the function is always - cfg returned by DispatchContext, - the rest arguments are the same as the fworkload. - override : bool - Whether override existing registration. - - Returns - ------- - The register function if necessary. - """ - if isinstance(key, str): - key = [key] - - def _do_reg(myf): - for x in key: - if x in dispatch_dict and not override: - raise ValueError( - "Key %s is already registered for %s" % (x, func_name)) - dispatch_dict[x] = myf - return myf - - if func: - return _do_reg(func) - return _do_reg - - def dispatch_func(func, *args, **kwargs): - """The wrapped dispatch function""" - tgt = _target.Target.current() - workload = func(*args, **kwargs) - cfg = DispatchContext.current.query(tgt, workload) - if cfg.is_fallback and not cfg.template_key: - # first try 'direct' template - if 'direct' in dispatch_dict: - return dispatch_dict['direct'](cfg, *args, **kwargs) - # otherwise pick a random template - for v in dispatch_dict.values(): - return v(cfg, *args, **kwargs) - else: - return dispatch_dict[cfg.template_key](cfg, *args, **kwargs) - - fdecorate = decorate(fworkload, dispatch_func) - fdecorate.register = register - return fdecorate - - class ApplyConfig(DispatchContext): """Apply a deterministic config entity for all queries. @@ -334,7 +261,8 @@ def _query_inside(self, target, workload): if key in self._best_user_defined: return self._best_user_defined[key] if key in self.best_by_model: - return self.best_by_model[key][0].config + inp, _ = self.best_by_model[key] + return inp.config # then try matching by target key for k in target.keys: @@ -342,13 +270,16 @@ def _query_inside(self, target, workload): if key in self._best_user_defined: return self._best_user_defined[key] if key in self.best_by_targetkey: - return self.best_by_targetkey[key][0].config + inp, _ = self.best_by_targetkey[key] + return inp.config return None def update(self, target, workload, cfg): model = target.model key = (model, workload) + # assume user provided config is the best + cfg.cost = 0 self._best_user_defined[key] = cfg for k in target.keys: @@ -481,8 +412,12 @@ def _query_inside(self, target, workload): """ if self._counter < len(self._records): cfg = self._records[self._counter][0].config + wkl = self._records[self._counter][0].task.workload + if workload is not None: + assert wkl == workload self._counter += 1 - self.update(target, workload, cfg) + self.update(target, wkl, cfg) + cfg.workload = wkl return cfg key = (str(target), workload) if key not in self._global_cfg_dict: diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index b39c8d446c7f..fda646c053f5 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -21,7 +21,6 @@ """ import threading -import warnings import logging @@ -55,8 +54,7 @@ def _lower(mod, compiler.lower(mod, target=target) -def extract_from_program(mod, params, ops, target, target_host=None, - template_keys=None): +def extract_from_program(mod, params, target, target_host=None, ops=None): """ Extract tuning tasks from a relay program. This function is the single program version of extract_from_multiple_program. @@ -67,27 +65,22 @@ def extract_from_program(mod, params, ops, target, target_host=None, The module or function to tune params: dict of str to numpy array The associated parameters of the program - ops: List of relay op - List of relay ops to be tuned target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target - template_keys: dict of topi op to str - The tuning template keys map for schedules, default to None. - Example: {topi.nn.conv2d: 'direct'} + ops: List of relay.op.Op + List of relay ops to be tuned Returns ------- task: Array of autotvm.task.Task collected tasks """ - return extract_from_multiple_program([mod], [params], ops, target, target_host, - template_keys) + return extract_from_multiple_program([mod], [params], target, target_host, ops) -def extract_from_multiple_program(mods, params, ops, target, target_host=None, - template_keys=None): +def extract_from_multiple_program(mods, params, target, target_host=None, ops=None): """ Extract tuning tasks from multiple relay programs. This function collects tuning tasks by building a list of programs @@ -99,15 +92,12 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, The list of modules or functions to tune params: List of dict of str to numpy array The associated parameters of the programs - ops: List of relay op - List of relay ops to be tuned target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target - template_keys: dict of topi op to str - The tuning template keys map for schedules, default to None. - Example: {topi.nn.conv2d: 'direct'} + ops: List of relay.op.Op + List of relay ops to be tuned Returns ------- @@ -115,36 +105,13 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, collected tasks """ # pylint: disable=import-outside-toplevel - import tvm.relay.op from tvm import relay import topi env = TaskExtractEnv.get() - # NOTE: To add more ops, you only need to change the following lists - # relay op -> topi compute - OP2TOPI = { - tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, - topi.nn.group_conv2d_nchw, - topi.nn.conv2d_NCHWc, - topi.nn.conv2d_NCHWc_int8], - tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], - tvm.relay.op.nn.dense: [topi.nn.dense], - tvm.relay.op.nn.batch_matmul: [topi.nn.batch_matmul], - tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], - tvm.relay.op.nn.conv1d_transpose: [topi.nn.conv1d_transpose_ncw], - tvm.relay.op.nn.conv3d: [topi.nn.conv3d], - } - - topi_funcs = [] - for op_name in ops: - if op_name in OP2TOPI: - topi_funcs.extend(OP2TOPI[op_name]) - else: - warnings.warn("Op %s is not tunable, ignored" % op_name) - # run compiler to collect all TOPI calls during compilation - env.reset(topi_funcs) + env.reset(ops) with env: # disable logger temporarily old_state = logger.disabled @@ -164,24 +131,12 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, logger.disabled = old_state - # convert *topi op to template key* map to *task name to template key* map - task_name_to_keys = {} - if template_keys is not None: - for op in template_keys.keys(): - if op in env.topi_to_task: - task_name_to_keys[env.topi_to_task[op]] = template_keys[op] - else: - logger.warning("Invalid template key, fallback to direct") - task_name_to_keys[env.topi_to_task[op]] = 'direct' - # create tasks for target tasks = [] for task_name, args in env.get_tasks(): try: - key = task_name_to_keys[task_name] if task_name in task_name_to_keys else 'direct' tsk = create(task_name, args, - target=target, target_host=target_host, - template_key=key) + target=target, target_host=target_host) tasks.append(tsk) except topi.InvalidShapeError: logger.warning("Invalid shape during AutoTVM task creation") diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbdd34e502ca..d1810d4a75a2 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -613,9 +613,9 @@ def __init__(self): self._entity_map = OrderedDict() # name -> entity self._constraints = [] self.errors = [] - self.template_key = None self.code_hash = None self.flop = 0 + self.cost = None self.is_fallback = False @staticmethod @@ -796,7 +796,7 @@ def get(self, index): for name, space in self.space_map.items(): entities[name] = space[t % len(space)] t //= len(space) - ret = ConfigEntity(index, self.code_hash, self.template_key, entities, self._constraints) + ret = ConfigEntity(index, self.code_hash, entities, self._constraints) return ret def __iter__(self): @@ -836,17 +836,14 @@ class ConfigEntity(ConfigSpace): index of this config in space code_hash: str hash of schedule code - template_key : str - The specific template key entity_map: dict map name to transform entity constraints : list List of constraints """ - def __init__(self, index, code_hash, template_key, entity_map, constraints): + def __init__(self, index, code_hash, entity_map, constraints): super(ConfigEntity, self).__init__() self.index = index - self.template_key = template_key self._collect = False self._entity_map = entity_map self._space_map = None @@ -897,7 +894,6 @@ def to_json_dict(self): """ ret = {} ret['i'] = int(self.index) - ret['t'] = self.template_key ret['c'] = self.code_hash entity_map = [] for k, v in self._entity_map.items(): @@ -932,7 +928,6 @@ def from_json_dict(json_dict): """ index = json_dict["i"] code_hash = json_dict["c"] - template_key = json_dict["t"] constraints = [] entity_map = OrderedDict() @@ -950,11 +945,10 @@ def from_json_dict(json_dict): raise RuntimeError("Invalid config knob type: " + knob_type) entity_map[str(key)] = entity - return ConfigEntity(index, code_hash, template_key, entity_map, constraints) + return ConfigEntity(index, code_hash, entity_map, constraints) def __repr__(self): - return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key, - self.code_hash, self.index) + return "%s,%s,%d" % (str(self._entity_map)[12:-1], self.code_hash, self.index) class FallbackConfigEntity(ConfigSpace): @@ -1068,4 +1062,4 @@ def __setitem__(self, name, entity): self._entity_map[name] = entity def __repr__(self): - return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash) + return "%s,%s" % (str(self._entity_map)[12:-1], self.code_hash) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 9ff8b24fcb5d..7fbc94e6732f 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=unused-variable +# pylint: disable=unused-variable,not-callable """Definition of task function. Task can be constructed from tuple of func, args, and kwargs. @@ -24,10 +24,10 @@ import numpy as np -from ... import tensor, expr, container, target as _target +from ... import tensor, expr, container, placeholder, target as _target -from ..util import get_const_int, get_const_tuple, get_func_name -from .dispatcher import DispatchContext, ApplyConfig, dispatcher +from ..util import get_const_int, get_const_tuple +from .dispatcher import DispatchContext, ApplyConfig from .space import ConfigSpace def _raise_error(*args, **kwargs): # pylint: disable=unused-argument @@ -35,6 +35,39 @@ def _raise_error(*args, **kwargs): # pylint: disable=unused-argument "of this task is registered in another python file " "which is not imported in this run") + +def serialize_args(args): + """serialize arguments of a topi function to a hashable tuple. + + Parameters + ---------- + args: list of hashable or Tensor + """ + ret = [] + for t in args: + if isinstance(t, tensor.Tensor): + ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype)) + else: + ret.append(t) + return tuple(ret) + + +def deserialize_args(args): + """The inverse function of :code:`serialize_args`. + + Parameters + ---------- + args: list of hashable or Tensor + """ + ret = [] + for t in args: + if isinstance(t, tuple) and t[0] == 'TENSOR': + ret.append(placeholder(shape=t[1], dtype=t[2])) + else: + ret.append(t) + return ret + + class Task(object): """A Tunable Task @@ -116,43 +149,134 @@ def __repr__(self): self.name, self.args, self.kwargs, self.workload ) -TASK_TABLE = { -} +TASK_TABLE = {} + +class TopiTemplate(object): + """Topi template that holds the topi compute and schedule function""" + def __init__(self): + self.compute = None + self.schedule = None + self.customized_func = None + + def __call__(self, *args, **kwargs): + args = deserialize_args(args) + if self.customized_func is None: + return self._default_func(*args, **kwargs) + assert callable(self.customized_func) + return self.customized_func(*args, **kwargs) + + def _default_func(self, *args, **kwargs): + assert callable(self.compute) and callable(self.schedule) + out = self.compute(*args, **kwargs) + arg_bufs = [out] + self.get_inputs(out) + s = self.schedule([out]) + return s, arg_bufs + + def get_inputs(self, out): + inputs = [] + queue = [out] + while queue: + t = queue.pop(0) + if isinstance(t.op, tensor.PlaceholderOp): + inputs.append(t) + else: + queue.extend(t.op.input_tensors) + return inputs + +def register_task_compute(name, func=None): + """Register compute function to autotvm task + + Parameters + ---------- + name: str + The task name + + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. -def register(name, func=None, override=False): - """Register a task function. + Returns + ------- + decorator: callable + A decorator + """ + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.compute is not None: + raise ValueError("Compute is already registered in autoTVM task %s" % name) + tmpl.compute = f + return f + if func: + return _do_reg(func) + return _do_reg + +def register_task_schedule(name, func=None): + """Register schedule function to autotvm task Parameters ---------- - name : str - The name to identify the task. - func : callable - The function to be registered. - override : bool - Whether override existing registration. + name: str + The task name + + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. Returns ------- - func: callable - The registered function + decorator: callable + A decorator """ - def _do_reg(myf): - if name in TASK_TABLE and not override: - raise ValueError( - "Key %s is already registered" % name) - TASK_TABLE[name] = myf - return myf + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.schedule is not None: + raise ValueError("Schedule is already registered in autoTVM task %s" % name) + tmpl.schedule = f + return f if func: return _do_reg(func) return _do_reg -def create(func_name, args, target, target_host=None, template_key=None): +def register_customized_task(name, func=None): + """Register a customized function to autotvm task. + + Parameters + ---------- + name: str + The task name + + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. + + Returns + ------- + decorator: callable + A decorator + """ + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.customized_func is not None: + raise ValueError("Customized func is already registered in autoTVM task %s" % name) + tmpl.customized_func = f + return f + if func: + return _do_reg(func) + return _do_reg + +def create(task_name, args, target, target_host=None): """Create a tuning task and initialize its search space Parameters ---------- - func_name : str or callable - The task function + task_name : str + The AutoTVM task name args : List Positional arguments target : Target @@ -165,30 +289,18 @@ def create(func_name, args, target, target_host=None, template_key=None): tsk: Task a task object """ - if callable(func_name): - # register this function if it is not registered before - func = func_name - func_name = func.func_name if hasattr(func, 'func_name') else func.__name__ - if func_name in TASK_TABLE: - assert func == TASK_TABLE[func_name], "Find name conflict in task registration. " \ - "Consider to choose another name for this task" - else: - register(func_name, func=func) - - func = TASK_TABLE[func_name] - ret = Task(func_name, args) + ret = Task(task_name, args) if isinstance(target, str): target = _target.create(target) # init config space ret.config_space = ConfigSpace() - ret.config_space.template_key = template_key or "" ctx = ApplyConfig(ret.config_space) with ctx: with target: - sch, _ = func(*args) + sch, _ = ret.func(*args) ret.config_space.code_hash = getattr(sch, 'code_hash', None) ret.workload = ctx.workload @@ -198,7 +310,7 @@ def create(func_name, args, target, target_host=None, template_key=None): return ret -def args_to_workload(x, topi_compute_func=None): +def args_to_workload(x, task_name=None): """Convert argument list to hashable workload tuple. This function will convert list to tuple, tvm node to python value and flatten tvm.tensor.Tensor to a tuple @@ -207,8 +319,8 @@ def args_to_workload(x, topi_compute_func=None): ---------- x: primitive hashable types or tensor.Tensor The original value - topi_compute_func: topi compute function - The function name will be added as first element of the workload tuple + task_name: str + The AutoTVM task name Returns ------- @@ -227,76 +339,76 @@ def args_to_workload(x, topi_compute_func=None): workload = 0 else: raise RuntimeError('Do not support type "%s" in argument. Consider to use' - 'primitive types or tvm.tir.Var only' % type(x)) - return (get_func_name(topi_compute_func), ) + workload if topi_compute_func else workload - -def template(func): - """ - Decorate a function as a tunable schedule template - - Parameters - ---------- - func: callable - A callable template function. - Its argument should be hashable values. - Its return value should be a Tuple(Schedule, Array of Tensor) - - Returns - ------- - func: callable - The decorated function - - Examples - -------- - The following code is a tunable template for a blocked matrix multiplication - - .. code-block:: python - - @autotvm.template - def matmul(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) - - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) + 'primitive types or tvm.expr.Var only' % type(x)) + return tuple((task_name, ) + workload) if task_name else workload - return s, [A, B, C] - """ - # pylint: disable=unused-variable - - fname = get_func_name(func) - - @register(fname) - @dispatcher - def config_dispatcher(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - return (fname, ) + args_to_workload(args) - - @config_dispatcher.register("") - def template_call(cfg, *args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - with ApplyConfig(cfg): - return func(*args, **kwargs) - - config_dispatcher.func_name = fname - return config_dispatcher +# def template(func): +# """ +# Decorate a function as a tunable schedule template +# +# Parameters +# ---------- +# func: callable +# A callable template function. +# Its argument should be hashable values. +# Its return value should be a Tuple(Schedule, Array of Tensor) +# +# Returns +# ------- +# func: callable +# The decorated function +# +# Examples +# -------- +# The following code is a tunable template for a blocked matrix multiplication +# +# .. code-block:: python +# +# @autotvm.template +# def matmul(N, L, M, dtype): +# A = tvm.placeholder((N, L), name='A', dtype=dtype) +# B = tvm.placeholder((L, M), name='B', dtype=dtype) +# +# k = tvm.reduce_axis((0, L), name='k') +# C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') +# s = tvm.create_schedule(C.op) +# +# # schedule +# y, x = s[C].op.axis +# k = s[C].op.reduce_axis[0] +# +# ##### define space begin ##### +# cfg = autotvm.get_config() +# cfg.define_split("tile_y", y, num_outputs=2) +# cfg.define_split("tile_x", x, num_outputs=2) +# ##### define space end ##### +# +# # schedule according to config +# yo, yi = cfg["tile_y"].apply(s, C, y) +# xo, xi = cfg["tile_x"].apply(s, C, x) +# +# s[C].reorder(yo, xo, k, yi, xi) +# +# return s, [A, B, C] +# """ +# # pylint: disable=unused-variable +# +# fname = get_func_name(func) +# +# @register(fname) +# @dispatcher +# def config_dispatcher(*args, **kwargs): +# assert not kwargs, "Do not support kwargs in template function call" +# return (fname, ) + args_to_workload(args) +# +# @config_dispatcher.register("") +# def template_call(cfg, *args, **kwargs): +# assert not kwargs, "Do not support kwargs in template function call" +# with ApplyConfig(cfg): +# return func(*args, **kwargs) +# +# config_dispatcher.func_name = fname +# return config_dispatcher def get_config(): """Get current config object diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 3d3a1d3d3a4e..29796df14271 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -26,6 +26,7 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ +<<<<<<< HEAD import tvm.te._ffi_api from ... import tensor, placeholder @@ -68,6 +69,13 @@ def deserialize_args(args): else: ret.append(t) return ret +======= +from tvm import target as _target + +from ... import _api_internal, tensor +from .task import args_to_workload, DispatchContext, \ + register_task_compute, register_task_schedule, serialize_args +>>>>>>> relay op strategy # Task extractor for relay program @@ -77,250 +85,49 @@ class TaskExtractEnv: registered = None def __init__(self, allow_duplicate=False): - # pylint: disable=import-outside-toplevel - import topi - - # topi compute -> autotvm task name - self.topi_to_task = { - topi.nn.conv2d: "topi_nn_conv2d", - topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw", - topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw", - topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw", - topi.nn.conv2d_NCHWc: "topi_x86_conv2d_NCHWc", - topi.nn.conv2d_NCHWc_int8: "topi_x86_conv2d_NCHWc_int8", - topi.nn.dense: "topi_nn_dense", - topi.nn.batch_matmul: "topi_nn_batch_matmul", - topi.nn.bitserial_conv2d_nchw: "topi_nn_bitserial_conv2d_nchw", - topi.nn.bitserial_conv2d_nhwc: "topi_nn_bitserial_conv2d_nhwc", - topi.nn.bitserial_dense: "topi_nn_bitserial_dense", - topi.nn.deformable_conv2d_nchw: "topi_nn_deformable_conv2d_nchw", - topi.nn.conv1d_transpose_ncw: "topi_nn_conv1d_transpose_ncw", - topi.nn.conv3d: "topi_nn_conv3d", - } - - self.topi_to_schedule = { - topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw, - topi.generic.schedule_conv2d_nhwc], - topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw, - topi.generic.schedule_depthwise_conv2d_nhwc], - topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw], - topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw], - topi.nn.conv2d_NCHWc: [topi.generic.schedule_conv2d_NCHWc], - topi.nn.conv2d_NCHWc_int8: [topi.generic.schedule_conv2d_NCHWc_int8], - topi.nn.dense: [topi.generic.schedule_dense], - topi.nn.batch_matmul: [topi.generic.schedule_batch_matmul], - topi.nn.bitserial_conv2d_nchw: [topi.generic.schedule_bitserial_conv2d_nchw], - topi.nn.bitserial_conv2d_nhwc: [topi.generic.schedule_bitserial_conv2d_nhwc], - topi.nn.bitserial_dense: [topi.generic.schedule_bitserial_dense], - topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], - topi.nn.conv1d_transpose_ncw: [topi.generic.schedule_conv1d_transpose_ncw], - topi.nn.conv3d: [topi.generic.schedule_conv3d_ndhwc], - } - - # function reflection for tracing - self.func_to_reflection = { - topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), - topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), - topi.nn.conv2d_NCHWc_int8: lambda x: setattr(topi.nn, 'conv2d_NCHWc_int8', x), - topi.nn.depthwise_conv2d_nchw: lambda x: setattr(topi.nn, 'depthwise_conv2d_nchw', x), - topi.nn.group_conv2d_nchw: lambda x: setattr(topi.nn, 'group_conv2d_nchw', x), - topi.nn.conv2d_transpose_nchw: lambda x: setattr(topi.nn, 'conv2d_transpose_nchw', x), - topi.nn.dense: lambda x: setattr(topi.nn, 'dense', x), - topi.nn.batch_matmul: lambda x: setattr(topi.nn, 'batch_matmul', x), - topi.nn.bitserial_conv2d_nchw: lambda x: setattr(topi.nn, 'bitserial_conv2d_nchw', x), - topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x), - topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x), - topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x), - topi.nn.conv1d_transpose_ncw: lambda x: setattr(topi.nn, 'conv1d_transpose_ncw', x), - topi.nn.conv3d: lambda x: setattr(topi.nn, 'conv3d', x), - } - self.allow_duplicate = allow_duplicate - self._register_topi_task() self.task_collection = [] - self.wanted_topi_funcs = list(self.topi_to_task.keys()) + self.wanted_relay_ops = None self.modified_funcs = [] + self.tracing = False def __enter__(self): self.task_collection = [] - self.modified_funcs = [] - - for topi_compute in self.wanted_topi_funcs: - def _local_scope(compute_func): - """start a scope to hold the local function in for loop""" - - def _tracing_wrapper(*args, **kwargs): - assert not kwargs, "Do not support extracting tuning tasks when " \ - "kwargs is used in TOPI function call. " \ - "Please modify it to use only positional args." - key = (self.topi_to_task[compute_func], serialize_args(args)) - if self.allow_duplicate or key not in self.task_collection: - self.task_collection.append(key) - - return compute_func(*args, **kwargs) - - self.func_to_reflection[compute_func](_tracing_wrapper) - self.modified_funcs.append(compute_func) - - _local_scope(topi_compute) + self.tracing = True return self def __exit__(self, exc_type, exc_val, exc_tb): - # revert modification - for func in self.modified_funcs: - self.func_to_reflection[func](func) - - def _register_topi_task(self): - """register tuning wrapper for topi function""" - # pylint: disable=import-outside-toplevel - import topi - - # Avoid double registration for certain targets - if TaskExtractEnv.registered: - return - TaskExtractEnv.registered = True - - # Tuning wrapper for topi functions - @register("topi_nn_conv2d") - def _topi_nn_conv2d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - layout = args[-2] - C = topi.nn.conv2d(*args, **kwargs) - if layout == 'NCHW': - s = topi.generic.schedule_conv2d_nchw([C]) - elif layout == 'HWCN': - s = topi.generic.schedule_conv2d_hwcn([C]) - elif layout == 'NHWC': - s = topi.generic.schedule_conv2d_nhwc([C]) - else: - raise ValueError("Unsupported layout {}".format(layout)) - return s, [A, W, C] + self.tracing = False - @register("topi_nn_depthwise_conv2d_nchw") - def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_depthwise_conv2d_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_group_conv2d_nchw") - def _topi_nn_group_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.group_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_group_conv2d_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_conv2d_transpose_nchw") - def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv2d_transpose_nchw(*args, **kwargs) - s = topi.generic.schedule_conv2d_transpose_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_conv1d_transpose_ncw") - def _topi_nn_conv1d_transpose_ncw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv1d_transpose_ncw(*args, **kwargs) - s = topi.generic.schedule_conv1d_transpose_ncw([C]) - return s, [A, W, C] - - @register("topi_nn_conv3d") - def _topi_nn_conv3d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv3d(*args, **kwargs) - s = topi.generic.schedule_conv3d_ndhwc([C]) - return s, [A, W, C] - - @register("topi_nn_dense") - def _topi_nn_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - if len(args) > 2: - data, weight, bias = args[:3] - else: - data, weight = args - bias = None - C = topi.nn.dense(*args, **kwargs) - s = topi.generic.schedule_dense([C]) - if bias is not None: - return s, [data, weight, bias, C] - return s, [data, weight, C] - - @register("topi_nn_batch_matmul") - def _topi_nn_batch_matmul(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, B = args - C = topi.nn.batch_matmul(A, B) - s = topi.generic.schedule_batch_matmul([C]) - return s, [A, B, C] - - @register("topi_nn_bitserial_conv2d_nhwc") - def _topi_bitserial_conv2d_nhwc(*args, **kwargs): - args = deserialize_args(args) - C = topi.nn.bitserial_conv2d_nhwc(*args, **kwargs) - s = topi.generic.nn.schedule_bitserial_conv2d_nhwc([C]) - A, W = args[:2] - return s, [A, W, C] - - @register("topi_nn_bitserial_conv2d_nchw") - def _topi_bitserial_conv2d_nchw(*args, **kwargs): - args = deserialize_args(args) - C = topi.nn.bitserial_conv2d_nchw(*args, **kwargs) - s = topi.generic.nn.schedule_bitserial_conv2d_nchw([C]) - A, W = args[:2] - return s, [A, W, C] - - @register("topi_nn_bitserial_dense") - def _topi_nn_bitserial_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.bitserial_dense(*args, **kwargs) - s = topi.generic.schedule_bitserial_dense([C]) - return s, [A, W, C] - - @register("topi_nn_deformable_conv2d_nchw") - def _topi_nn_deformable_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, Offset, W = args[:3] - C = topi.nn.deformable_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_deformable_conv2d_nchw([C]) - return s, [A, Offset, W, C] - - @register("topi_nn_conv2d_NCHWc") - def _topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv2d_NCHWc(*args, **kwargs) - s = topi.generic.schedule_conv2d_NCHWc([C]) - return s, [A, W, C] - - def reset(self, wanted_topi_funcs): + def reset(self, wanted_relay_ops=None): """Reset task collections Parameters ---------- - wanted_topi_funcs: List of function - The topi function to be extracted + wanted_relay_ops: List of relay.op.Op + The relay ops to be extracted """ self.task_collection = [] - self.wanted_topi_funcs = wanted_topi_funcs + self.wanted_relay_ops = wanted_relay_ops + + def add_task(self, task_name, args): + """Add AutoTVM task + + Parameters + ---------- + task_name: str + AutoTVM task name. + + args: tuple + Arguments to the TOPI function. + + cond: SpecializedCondition + Specialized condition to enable the TOPI template. + """ + key = (task_name, serialize_args(args)) + if self.allow_duplicate or key not in self.task_collection: + self.task_collection.append(key) def get_tasks(self): """Get collected tasks @@ -355,7 +162,7 @@ def get(allow_duplicate=False): return TaskExtractEnv.current -def register_topi_compute(topi_compute, target_keys, template_keys, func=None, override=False): +def register_topi_compute(task_name, func=None): """Register a tunable template for a topi compute function. After the registration, this topi compute will become a configuration dispatcher. It uses @@ -366,15 +173,9 @@ def register_topi_compute(topi_compute, target_keys, template_keys, func=None, o Parameters ---------- - topi_compute: GenericFunc - The topi compute function that will be overloaded - target_keys: str or list of str - The compilation target. The same as the argument of GenericFunc.register. - template_keys: str or list of str - The template key. - We might have several strategies for a single operator (e.g. direct, im2col, winograd). - The template key is used to identity the algorithm strategy. - Every operator must have a "direct" template, which is used by default. + task_name: str + The AutoTVM task name + func: None or callable If it is None, return a decorator. If is callable, decorate this function. @@ -388,6 +189,7 @@ def register_topi_compute(topi_compute, target_keys, template_keys, func=None, o -------- See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ +<<<<<<< HEAD def _decorator(f): targets = [target_keys] if isinstance(target_keys, str) else target_keys for target_key in targets: @@ -436,14 +238,50 @@ def template_call(cfg, *args, **kwargs): return [op.output(i) for i in range(len(node))] return f +======= + def _decorate(topi_compute): + @register_task_compute(task_name) + def wrapper(*args, **kwargs): + """wrapper function for topi compute""" + assert not kwargs, "Do not support kwargs in template function call" + task_env = TaskExtractEnv.current + if task_env is not None and task_env.tracing: + task_env.add_task(task_name, args) + workload = args_to_workload(args, task_name) + tgt = _target.current_target() + cfg = DispatchContext.current.query(tgt, workload) + node = topi_compute(cfg, *args) + + # attach workload to return op + op = node.op + attrs = {} + for k, v in node.op.attrs.items(): + attrs[k] = v + attrs['workload'] = workload + if isinstance(op, tensor.ComputeOp): + op = _api_internal._ComputeOp( + op.name, op.tag, attrs, op.axis, op.body) + elif isinstance(op, tensor.ExternOp): + op = _api_internal._ExternOp( + op.name, op.tag, attrs, + op.inputs, op.input_placeholders, + op.output_placeholders, op.body) + else: + raise RuntimeError("Unsupported op type: " + str(type(op))) +>>>>>>> relay op strategy - if func: - _decorator(func) + if isinstance(node, tensor.Tensor): + return op.output(0) + return [op.output(i) for i in range(len(node))] + + return wrapper - return _decorator + if func: + return _decorate(func) + return _decorate -def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, override=False): +def register_topi_schedule(task_name, func=None): """Register a tunable template for a topi schedule function. After the registration. This topi schedule will become a configuration dispatcher. It dispatches @@ -452,17 +290,13 @@ def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, Note that this function will try to find "workload" from all the ComputeOp in the input. You can attach "workload" to your compute op by using :any:`register_topi_compute`. + The task name need to match with the task name of the corresponding topi compute function. + Parameters ---------- - topi_schedule: GenericFunc - The topi schedule function that will be overloaded - target_keys: str or list of str - The compilation target - template_keys: str or list of str - The template key. - We might have several strategies for a single operator (e.g. direct, im2col, winograd). - The template key is used to identity the algorithm strategy. - Every operator must have a "direct" template, which is used by default. + task_name: str + The AutoTVM task name + func: None or callable If it is None, return a decorator. If is callable, decorate this function. @@ -476,49 +310,33 @@ def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, -------- See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ - def _decorator(f): - targets = [target_keys] if isinstance(target_keys, str) else target_keys - for target_key in targets: - if target_key not in _REGISTERED_DISPATCHER: - _REGISTERED_DISPATCHER[target_key] = {} - if topi_schedule not in _REGISTERED_DISPATCHER[target_key]: - @topi_schedule.register(target_key) - @dispatcher - def config_dispatcher(outs, *args, **kwargs): - """override topi call as a workload dispatcher""" - def traverse(tensors): - """traverse all ops to find attached workload""" - for t in tensors: - op = t.op - if 'workload' in op.attrs: - return op.attrs['workload'] - wkl = traverse(op.input_tensors) - if wkl: - return wkl - return None - - outs = [outs] if isinstance(outs, tensor.Tensor) else outs - workload = traverse(outs) - - if workload is None: - raise RuntimeError("Cannot find workload in attribute of this schedule") - - return args_to_workload(workload) - - _REGISTERED_DISPATCHER[target_key][topi_schedule] = config_dispatcher - - config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_schedule] - - @config_dispatcher.register(template_keys, override=override) - def template_call(cfg, outs, *args, **kwargs): - """call the schedule func""" - if f == topi_schedule.fdefault: - return f(outs, *args, **kwargs) - return f(cfg, outs, *args, **kwargs) - - return f - + def _decorate(topi_schedule): + @register_task_schedule(task_name) + def wrapper(outs, *args, **kwargs): + """wrapper function for topi schedule""" + workload = get_workload(outs) + if workload is None: + raise RuntimeError("Cannot find workload in attribute of this schedule") + tgt = _target.current_target() + cfg = DispatchContext.current.query(tgt, workload) + return topi_schedule(cfg, outs, *args, **kwargs) + return wrapper if func: - _decorator(func) - - return _decorator + return _decorate(func) + return _decorate + + +def get_workload(outs): + """Retrieve the workload from outputs""" + def traverse(tensors): + """traverse all ops to find attached workload""" + for t in tensors: + op = t.op + if 'workload' in op.attrs: + return args_to_workload(op.attrs['workload']) + wkl = traverse(op.input_tensors) + if wkl: + return wkl + return None + outs = [outs] if isinstance(outs, tensor.Tensor) else outs + return traverse(outs) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index e1a7d86695f2..ce0be70e4a15 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -189,7 +189,7 @@ def download_package(tophub_location, package_name): # global cache for load_reference_log REFERENCE_LOG_CACHE = {} -def load_reference_log(backend, model, workload_name, template_key): +def load_reference_log(backend, model, workload_name): """ Load reference log from TopHub to support fallback in template. Template will use these reference logs to choose fallback config. @@ -201,8 +201,6 @@ def load_reference_log(backend, model, workload_name, template_key): The name of the device model workload_name: str The name of the workload. (The first item in the workload tuple) - template_key: str - The template key """ backend = _alias(backend) @@ -211,7 +209,7 @@ def load_reference_log(backend, model, workload_name, template_key): filename = os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name) global REFERENCE_LOG_CACHE - key = (backend, model, workload_name, template_key) + key = (backend, model, workload_name) if key not in REFERENCE_LOG_CACHE: tmp = [] @@ -233,8 +231,7 @@ def load_reference_log(backend, model, workload_name, template_key): model = max(counts.items(), key=lambda k: k[1])[0] for inp, res in load_from_file(filename): - if (model == inp.target.model and inp.task.workload[0] == workload_name and - inp.config.template_key == template_key): + if model == inp.target.model and inp.task.workload[0] == workload_name: tmp.append((inp, res)) REFERENCE_LOG_CACHE[key] = tmp diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 882b0ad19dd5..305244808a33 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -219,8 +219,7 @@ def fit_log(self, records, plan_size): # filter data, only pick the data with a same task data = [] for inp, res in records: - if inp.task.name == self.task.name and \ - inp.config.template_key == self.task.config_space.template_key: + if inp.task.name == self.task.name: data.append((inp, res)) logger.debug("XGB load %d entries from history log file", len(data)) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 4eedd23faa1c..e07baf20e54b 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -14,18 +14,38 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=len-as-condition,no-else-return """Backend code generation engine.""" from __future__ import absolute_import +import hashlib +import numpy as np +import tvm +from topi import tag from ..base import register_relay_node, Object +from ... import _api_internal from ... import target as _target +from ..._ffi.function import register_func +from ... import autotvm from .. import expr as _expr +from .. import op as _op +from .. import ty as _ty +from ..expr_functor import ExprVisitor from . import _backend @register_relay_node class CachedFunc(Object): """Low-level tensor function to back a relay primitive function. """ + def __init__(self, target, func_name, inputs, outputs, schedule=None, + lowered_funcs=None, shape_func_param_states=None): + if lowered_funcs is None: + lowered_funcs = [] + if shape_func_param_states is None: + shape_func_param_states = [] + self.__init_handle_by_constructor__( + _backend._make_CachedFunc, target, func_name, inputs, outputs, + schedule, lowered_funcs, shape_func_param_states) @register_relay_node @@ -63,6 +83,316 @@ def _get_cache_key(source_func, target): return source_func +def get_shape(shape): + """Convert the shape to correct dtype and vars.""" + ret = [] + for dim in shape: + if isinstance(dim, tvm.expr.IntImm): + val = int(dim) + assert val <= np.iinfo(np.int32).max + ret.append(tvm.expr.IntImm("int32", val)) + elif isinstance(dim, tvm.expr.Any): + ret.append(tvm.var("any_dim", "int32")) + else: + ret.append(dim) + return ret + + +def get_valid_implements(op, attrs, inputs, out_type, target): + """Get all valid implementations from the op strategy. + + Note that this function doesn't support op that has symbolic input shapes. + + Parameters + ---------- + op : relay.op.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : list of tvm.Tensor + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.Target + The target to compile the op. + + Returns + ------- + ret : list of relay.op.OpImplement + The list of op implementations. + """ + fstrategy = op.get_attr("FTVMStrategy") + assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name + with target: + strategy = fstrategy(attrs, inputs, out_type, target) + ret = [] + for spec in strategy.specializations: + if spec.condition: + # check if all the clauses in the specialized condition are true + flag = True + for clause in spec.condition.clauses: + clause = tvm.ir_pass.Simplify(clause) + if isinstance(clause, tvm.expr.IntImm) and clause.value: + continue + flag = False + break + if flag: + for impl in spec.implements: + ret.append(impl) + else: + for impl in spec.implements: + ret.append(impl) + return ret + + +def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): + """Select the best implement from the op strategy. + + If use_autotvm is True, it'll first try to find the best implementation + based on AutoTVM profile results. If no AutoTVM profile result is found, + it'll choose the implementation with highest plevel. + + If use_autotvm is False, it'll directly choose the implementation with + highest plevel. + + Note that this function doesn't support op that has symbolic input shapes. + + Parameters + ---------- + op : relay.op.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : list[tvm.Tensor] + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.Target + The target to compile the op. + + use_autotvm : bool + Whether query AutoTVM to pick the best. + + Returns + ------- + ret : tuple(relay.op.OpImplement, list[tvm.Tensor]) + The best op implementation and the corresponding output tensors. + """ + all_impls = get_valid_implements(op, attrs, inputs, out_type, target) + + best_plevel_impl = None + for impl in all_impls: + if best_plevel_impl is None or int(impl.plevel) > int(best_plevel_impl.plevel): + best_plevel_impl = impl + if not use_autotvm: + outs = best_plevel_impl.compute(attrs, inputs, out_type) + return best_plevel_impl, outs + + outputs = {} + best_autotvm_impl = None + best_cfg = None + dispatch_ctx = autotvm.task.DispatchContext.current + for impl in all_impls: + outs = impl.compute(attrs, inputs, out_type) + outputs[impl] = outs + workload = autotvm.task.get_workload(outs) + if workload is None: + continue + cfg = dispatch_ctx.query(target, workload) + if cfg.cost is None: + # It's a fallback config + continue + if best_cfg is None or best_cfg.cost > cfg.cost: + best_autotvm_impl = impl + best_cfg = cfg + if best_autotvm_impl: + return best_autotvm_impl, outputs[best_autotvm_impl] + return best_plevel_impl, outputs[best_plevel_impl] + + +class ScheduleGetter(ExprVisitor): + """Get the schedule given a fused Relay function""" + + MAX_FUNC_NAME_LENGTH = 80 + + def __init__(self, target): + super().__init__() + self.target = target + self.master_op = None + self.master_attrs = None + self.master_op_pattern = 0 + self.master_implement = None + self.func_name = "" + self.scalars = [] + self._device_copy_op = _op.get("device_copy") + + def create(self, prim_func): + """Get the schedule and create the cached function""" + assert isinstance(prim_func, _expr.Function) + assert prim_func.is_primitive() + + def create_tensors(typ, tensors): + if isinstance(typ, _ty.TensorType): + tensors.append(tvm.placeholder(get_shape(typ.shape), typ.dtype)) + else: + assert isinstance(typ, _ty.TupleType) + for field in typ.fields: + create_tensors(field, tensors) + + inputs = [] + for param in prim_func.params: + tensors = [] + create_tensors(param.checked_type, tensors) + self.memo_map[param] = tensors + inputs.extend(tensors) + self.func_name = "fused" + outputs = self.visit(prim_func.body) + if len(self.func_name) > ScheduleGetter.MAX_FUNC_NAME_LENGTH: + hash_digest = int(hashlib.sha1(self.func_name).hexdigest(), 16) + self.func_name = "%s_%s" % ( + self.func_name[:ScheduleGetter.MAX_FUNC_NAME_LENGTH], hash_digest) + + assert self.master_op is not None + tensor_outs = [] + for tensor in outputs: + if not isinstance(tensor.op, tvm.tensor.PlaceholderOp): + tensor_outs.append(tensor) + sch = None + if not isinstance(self.master_attrs, _op.op_attrs.DeviceCopyAttrs): + # print('master op:', self.master_op.name) + sch = self.master_implement.schedule(self.master_attrs, tensor_outs, self.target) + for scalar in self.scalars: + sch[scalar].compute_inline() + return CachedFunc(self.target, self.func_name, inputs, outputs, sch) + + def visit_var(self, var): + assert False, "Found free variable " + var.name_hint + + def visit_constant(self, const): + assert len(const.data.shape) == 0, "Constant is not scalar" + dtype = const.data.dtype + data = const.data.asnumpy() + def fcompute(): + if dtype.startswith("int"): + return tvm.expr.IntImm(dtype, int(data)) + elif dtype.startswith("uint"): + return tvm.expr.UIntImm(dtype, int(data)) + elif dtype.startswith("float"): + return tvm.expr.FloatImm(dtype, float(data)) + else: + assert False, "not handled" + return tvm.expr.Expr() + value = tvm.compute((), fcompute, name="compile_engine_const", tag=tag.BROADCAST) + self.scalars.append(value.op) + return [value] + + def visit_call(self, call): + inputs = [] + count_tuple = 0 + for arg in call.args: + if isinstance(arg.checked_type, _ty.TupleType): + count_tuple += 1 + inputs.extend(self.visit(arg)) + assert count_tuple <= 1, "Only allow function with a single tuple input" + ret_type = call.checked_type + if isinstance(ret_type, _ty.TensorType): + ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) + elif isinstance(ret_type, _ty.TupleType): + new_fields = [] + for field in ret_type.fields: + if isinstance(field, _ty.TensorType): + new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) + else: + new_fields.append(field) + ret_type = _ty.TupleType(new_fields) + assert isinstance(call.op, _op.Op) + op = call.op + + # disable AutoTVM tracing if op is not in wanted list + env = autotvm.task.TaskExtractEnv.current + reenable_tracing = False + if env is not None and env.tracing: + if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: + env.tracing = False + reenable_tracing = True + + if op == self._device_copy_op: + copy_input = inputs[0] + outputs = [_api_internal._Tensor(copy_input.shape, copy_input.dtype, + None, 0)] + else: + is_dyn = call.checked_type.is_dynamic() + for arg in call.args: + is_dyn = is_dyn or arg.checked_type.is_dynamic() + + if not is_dyn: + best_impl, outputs = select_implement( + op, call.attrs, inputs, ret_type, self.target) + else: + # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes + # for dynamic case, we currently use the implementation with highest plevel + best_impl, outputs = select_implement( + op, call.attrs, inputs, ret_type, self.target, use_autotvm=False) + op_pattern = op.get_attr("TOpPattern") + if op_pattern >= _op.OpPattern.COMM_REDUCE: + assert self.master_op is None or self.master_op_pattern < _op.OpPattern.COMM_REDUCE, \ + "Two complicated op in a primitive function master=%s current=%s" % ( + self.master_op, op) + if op_pattern >= self.master_op_pattern: + self.master_op = op + self.master_attrs = call.attrs + self.master_op_pattern = op_pattern + self.master_implement = best_impl + if len(outputs) > 1: + assert isinstance(call.checked_type, _ty.TupleType) + assert len(call.checked_type.fields) == len(outputs) + if op == self._device_copy_op: + self.func_name += "__copy" + else: + self.func_name += "_" + op.name + + # re-enable AutoTVM tracing + if reenable_tracing: + env.tracing = True + + return outputs + + def visit_let(self, let): + val = self.visit(let.value) + assert let.var not in self.memo_map + self.memo_map[let.var] = val + return self.visit(let.body) + + def visit_tuple(self, tup): + fields = [] + for field in tup.fields: + assert isinstance(field.checked_type, _ty.TensorType), "Only allow Tuple of Tensor" + res = self.visit(field) + assert len(res) == 1 + fields.append(res[0]) + return fields + + def visit_tuple_getitem(self, t): + tup = self.visit(t.tuple) + assert len(tup) == len(t.tuple.checked_type.fields) + assert t.index >= 0 + assert t.index < tup.size() + return [tup[t.index]] + + +@register_func("relay.backend.create_schedule") +def create_schedule(src_func, target): + return ScheduleGetter(target).create(src_func) + + @register_relay_node class CompileEngine(Object): """CompileEngine to get lowered code. diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 39e68b8333ff..22d89050298c 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -277,6 +277,12 @@ def set_params(self, params): return _expr.FunctionSetParams(self, params) + def is_primitive(self): + return int(self.get_attribute("Primitive")) == 1 + + def get_attribute(self, name): + return _expr.FunctionGetAttr(self, name) + def set_attribute(self, name, ref): return _expr.FunctionSetAttr(self, name, ref) diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py index f492c743173c..8d6923920979 100644 --- a/python/tvm/relay/expr_functor.py +++ b/python/tvm/relay/expr_functor.py @@ -131,22 +131,22 @@ class ExprVisitor(ExprFunctor): The default behavior recursively traverses the AST. """ - def visit_tuple(self, t): - for x in t.fields: + def visit_tuple(self, tup): + for x in tup.fields: self.visit(x) - def visit_call(self, c): - self.visit(c.op) - for a in c.args: + def visit_call(self, call): + self.visit(call.op) + for a in call.args: self.visit(a) - def visit_var(self, v): + def visit_var(self, var): pass - def visit_let(self, l): - self.visit(l.var) - self.visit(l.value) - self.visit(l.body) + def visit_let(self, let): + self.visit(let.var) + self.visit(let.value) + self.visit(let.body) def visit_function(self, f): self.visit(f.body) diff --git a/python/tvm/relay/memory_alloc.py b/python/tvm/relay/memory_alloc.py index d61c6f1d6fba..f8e981121031 100644 --- a/python/tvm/relay/memory_alloc.py +++ b/python/tvm/relay/memory_alloc.py @@ -28,8 +28,8 @@ def is_primitive(call): - return hasattr(call.op, 'attrs') and hasattr(call.op.attrs, 'Primitive') and \ - int(call.op.attrs.Primitive) == 1 + return hasattr(call, 'op') and hasattr(call.op, 'attrs') and \ + hasattr(call.op.attrs, 'Primitive') and int(call.op.attrs.Primitive) == 1 # TODO(@jroesch): port to c++ and unify with existing code class LinearizeRetType: diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index bcd58ba5b1b1..8c22e35dfe6c 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -17,9 +17,10 @@ #pylint: disable=wildcard-import, redefined-builtin """Relay core operators.""" # operator defs -from .op import get, register, register_schedule, register_compute, register_gradient, \ +from .op import get, register, register_compute, register_gradient, \ register_pattern, register_alter_op_layout, register_legalize, \ - schedule_injective, Op, OpPattern, debug + Op, OpPattern, debug +from . import strategy # Operators from .reduce import * diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py index 09746be13e30..e1e6fd3a1139 100644 --- a/python/tvm/relay/op/_algorithm.py +++ b/python/tvm/relay/op/_algorithm.py @@ -18,48 +18,14 @@ # pylint: disable=invalid-name,unused-argument from __future__ import absolute_import -import topi -from topi.util import get_const_int -from ..op import OpPattern, register_compute, register_schedule, register_pattern - - -@register_schedule("argsort") -def schedule_argsort(_, outs, target): - """Schedule definition of argsort""" - with target: - return topi.generic.schedule_argsort(outs) - - -@register_compute("argsort") -def compute_argsort(attrs, inputs, _, target): - """Compute definition of argsort""" - axis = get_const_int(attrs.axis) - is_ascend = bool(get_const_int(attrs.is_ascend)) - dtype = attrs.dtype - return [topi.argsort(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)] - +from . import strategy +from .op import OpPattern, register_pattern +from .op import register_strategy +# argsort +register_strategy("argsort", strategy.argsort_strategy) register_pattern("argsort", OpPattern.OPAQUE) - -@register_schedule("topk") -def schedule_topk(_, outs, target): - """Schedule definition of argsort""" - with target: - return topi.generic.schedule_topk(outs) - - -@register_compute("topk") -def compute_topk(attrs, inputs, _, target): - """Compute definition of argsort""" - k = get_const_int(attrs.k) - axis = get_const_int(attrs.axis) - ret_type = attrs.ret_type - is_ascend = bool(get_const_int(attrs.is_ascend)) - dtype = attrs.dtype - out = topi.topk(inputs[0], k, axis, ret_type, is_ascend, dtype) - out = out if isinstance(out, list) else [out] - return out - - +# topk +register_strategy("topk", strategy.topk_strategy) register_pattern("topk", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index 43f71c0aa679..3103520bdfef 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -17,33 +17,21 @@ """Backend compiler related feature registration""" from __future__ import absolute_import -import topi - from topi.util import get_const_int, get_const_tuple from . import op as _reg from ...api import convert from ...hybrid import script - -def _schedule_reduce(_, outs, target): - """Generic schedule for reduce""" - with target: - return topi.generic.schedule_reduce(outs) - - -_reg.register_schedule("argmax", _schedule_reduce) -_reg.register_schedule("argmin", _schedule_reduce) -_reg.register_schedule("sum", _schedule_reduce) -_reg.register_schedule("all", _schedule_reduce) -_reg.register_schedule("any", _schedule_reduce) -_reg.register_schedule("max", _schedule_reduce) -_reg.register_schedule("min", _schedule_reduce) -_reg.register_schedule("prod", _schedule_reduce) -_reg.register_schedule("mean", _schedule_reduce) -_reg.register_schedule("variance", _schedule_reduce) -_reg.register_schedule("nn.cross_entropy", _schedule_reduce) -_reg.register_schedule("nn.cross_entropy_with_logits", _schedule_reduce) - +_reg.register_strategy_reduce("argmax") +_reg.register_strategy_reduce("argmin") +_reg.register_strategy_reduce("sum") +_reg.register_strategy_reduce("all") +_reg.register_strategy_reduce("any") +_reg.register_strategy_reduce("max") +_reg.register_strategy_reduce("min") +_reg.register_strategy_reduce("prod") +_reg.register_strategy_reduce("mean") +_reg.register_strategy_reduce("variance") def _create_axis_record(attrs, inputs): axes = attrs.axis if attrs.axis is None else list(get_const_tuple(attrs.axis)) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index c1d02bd56d1b..ebcb8e36aa65 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -19,101 +19,99 @@ from __future__ import absolute_import import topi from topi.util import get_const_tuple -from .op import register_compute, register_schedule, register_pattern, register_shape_func -from .op import schedule_injective, OpPattern +from .op import register_compute, register_shape_func +from .op import register_strategy_broadcast, register_strategy_injective +from .op import register_pattern, OpPattern from ...hybrid import script from ...api import convert -schedule_broadcast = schedule_injective -schedule_elemwise = schedule_injective - -register_schedule("log", schedule_broadcast) -register_schedule("cos", schedule_broadcast) -register_schedule("sin", schedule_broadcast) -register_schedule("atan", schedule_broadcast) -register_schedule("exp", schedule_broadcast) -register_schedule("erf", schedule_broadcast) -register_schedule("sqrt", schedule_broadcast) -register_schedule("rsqrt", schedule_broadcast) -register_schedule("sigmoid", schedule_broadcast) -register_schedule("floor", schedule_broadcast) -register_schedule("ceil", schedule_broadcast) -register_schedule("trunc", schedule_broadcast) -register_schedule("round", schedule_broadcast) -register_schedule("sign", schedule_broadcast) -register_schedule("abs", schedule_broadcast) -register_schedule("tanh", schedule_broadcast) -register_schedule("logical_not", schedule_broadcast) -register_schedule("bitwise_not", schedule_broadcast) -register_schedule("negative", schedule_broadcast) -register_schedule("copy", schedule_broadcast) - -register_schedule("add", schedule_broadcast) -register_schedule("subtract", schedule_broadcast) -register_schedule("multiply", schedule_broadcast) -register_schedule("divide", schedule_broadcast) -register_schedule("floor_divide", schedule_broadcast) -register_schedule("power", schedule_injective) -register_schedule("mod", schedule_broadcast) -register_schedule("floor_mod", schedule_broadcast) -register_schedule("logical_and", schedule_broadcast) -register_schedule("logical_or", schedule_broadcast) -register_schedule("bitwise_and", schedule_broadcast) -register_schedule("bitwise_or", schedule_broadcast) -register_schedule("bitwise_xor", schedule_broadcast) -register_schedule("equal", schedule_broadcast) -register_schedule("not_equal", schedule_broadcast) -register_schedule("less", schedule_broadcast) -register_schedule("less_equal", schedule_broadcast) -register_schedule("greater", schedule_broadcast) -register_schedule("greater_equal", schedule_broadcast) -register_schedule("maximum", schedule_injective) -register_schedule("minimum", schedule_injective) -register_schedule("right_shift", schedule_injective) -register_schedule("left_shift", schedule_injective) -register_schedule("shape_of", schedule_injective) + +register_strategy_broadcast("log") +register_strategy_broadcast("cos") +register_strategy_broadcast("sin") +register_strategy_broadcast("atan") +register_strategy_broadcast("exp") +register_strategy_broadcast("erf") +register_strategy_broadcast("sqrt") +register_strategy_broadcast("rsqrt") +register_strategy_broadcast("sigmoid") +register_strategy_broadcast("floor") +register_strategy_broadcast("ceil") +register_strategy_broadcast("trunc") +register_strategy_broadcast("round") +register_strategy_broadcast("sign") +register_strategy_broadcast("abs") +register_strategy_broadcast("tanh") +register_strategy_broadcast("add") +register_strategy_broadcast("subtract") +register_strategy_broadcast("multiply") +register_strategy_broadcast("divide") +register_strategy_broadcast("floor_divide") +register_strategy_broadcast("power") +register_strategy_broadcast("copy") +register_strategy_broadcast("logical_not") +register_strategy_broadcast("logical_and") +register_strategy_broadcast("logical_or") +register_strategy_broadcast("bitwise_not") +register_strategy_broadcast("bitwise_and") +register_strategy_broadcast("bitwise_or") +register_strategy_broadcast("bitwise_xor") +register_strategy_broadcast("negative") +register_strategy_broadcast("mod") +register_strategy_broadcast("floor_mod") +register_strategy_broadcast("equal") +register_strategy_broadcast("not_equal") +register_strategy_broadcast("less") +register_strategy_broadcast("less_equal") +register_strategy_broadcast("greater") +register_strategy_broadcast("greater_equal") +register_strategy_injective("maximum") +register_strategy_injective("minimum") +register_strategy_injective("right_shift") +register_strategy_injective("left_shift") +register_strategy_injective("shape_of") # zeros @register_compute("zeros") -def zeros_compute(attrs, inputs, output_type, target): +def zeros_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 0.0)] -register_schedule("zeros", schedule_broadcast) +register_strategy_broadcast("zeros") register_pattern("zeros", OpPattern.ELEMWISE) # zeros_like @register_compute("zeros_like") -def zeros_like_compute(attrs, inputs, output_type, target): +def zeros_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 0.0)] -register_schedule("zeros_like", schedule_broadcast) +register_strategy_broadcast("zeros_like") # ones @register_compute("ones") -def ones_compute(attrs, inputs, output_type, target): +def ones_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 1.0)] -register_schedule("ones", schedule_broadcast) +register_strategy_broadcast("ones") register_pattern("ones", OpPattern.ELEMWISE) # ones_like @register_compute("ones_like") -def ones_like(attrs, inputs, output_type, target): +def ones_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 1.0)] -register_schedule("ones_like", schedule_broadcast) +register_strategy_broadcast("ones_like") # clip @register_compute("clip") -def clip_compute(attrs, inputs, output_type, target): +def clip_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)] -register_schedule("clip", schedule_elemwise) +register_strategy_injective("clip") @script def _cast_shape_function(x): @@ -198,6 +196,7 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("floor_mod", False, broadcast_shape_func) register_shape_func("logical_and", False, broadcast_shape_func) register_shape_func("logical_or", False, broadcast_shape_func) +register_shape_func("bitwise_not", False, broadcast_shape_func) register_shape_func("bitwise_and", False, broadcast_shape_func) register_shape_func("bitwise_or", False, broadcast_shape_func) register_shape_func("bitwise_xor", False, broadcast_shape_func) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index e6053b887d38..ccc53cc6ef1d 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -21,52 +21,74 @@ import topi from topi.util import get_const_int, get_const_tuple from . import op as _reg -from ._reduce import _schedule_reduce +from . import strategy from .op import OpPattern from ...hybrid import script from ...api import convert -schedule_injective = _reg.schedule_injective -schedule_broadcast = _reg.schedule_injective -schedule_concatenate = _reg.schedule_concatenate - - -_reg.register_schedule("collapse_sum_like", _schedule_reduce) -_reg.register_schedule("broadcast_to", schedule_broadcast) -_reg.register_schedule("broadcast_to_like", schedule_broadcast) -_reg.register_schedule("expand_dims", schedule_broadcast) -_reg.register_schedule("squeeze", schedule_injective) -_reg.register_schedule("reshape", schedule_injective) -_reg.register_schedule("reshape_like", schedule_injective) -_reg.register_schedule("full", schedule_injective) -_reg.register_schedule("full_like", schedule_injective) -_reg.register_schedule("arange", schedule_injective) -_reg.register_schedule("reverse", schedule_injective) -_reg.register_schedule("repeat", schedule_broadcast) -_reg.register_schedule("tile", schedule_broadcast) -_reg.register_schedule("cast", schedule_injective) -_reg.register_schedule("cast_like", schedule_injective) -_reg.register_schedule("reinterpret", schedule_injective) -_reg.register_schedule("strided_slice", schedule_injective) -_reg.register_schedule("strided_set", schedule_injective) -_reg.register_schedule("slice_like", schedule_injective) -_reg.register_schedule("split", schedule_injective) -_reg.register_schedule("take", schedule_injective) -_reg.register_schedule("transpose", schedule_injective) -_reg.register_schedule("where", schedule_broadcast) -_reg.register_schedule("stack", schedule_injective) -_reg.register_schedule("concatenate", schedule_concatenate) -_reg.register_schedule("_contrib_reverse_reshape", schedule_injective) -_reg.register_schedule("gather_nd", schedule_injective) -_reg.register_schedule("sequence_mask", schedule_injective) -_reg.register_schedule("one_hot", schedule_injective) +_reg.register_strategy_broadcast("broadcast_to") +_reg.register_strategy_broadcast("broadcast_to_like") +_reg.register_strategy_broadcast("expand_dims") +_reg.register_strategy_broadcast("repeat") +_reg.register_strategy_broadcast("tile") +_reg.register_strategy_broadcast("where") +_reg.register_strategy_injective("squeeze") +_reg.register_strategy_injective("reshape") +_reg.register_strategy_injective("reshape_like") +_reg.register_strategy_injective("full") +_reg.register_strategy_injective("full_like") +_reg.register_strategy_injective("arange") +_reg.register_strategy_injective("reverse") +_reg.register_strategy_injective("cast") +_reg.register_strategy_injective("cast_like") +_reg.register_strategy_injective("reinterpret") +_reg.register_strategy_injective("strided_slice") +_reg.register_strategy_injective("slice_like") +_reg.register_strategy_injective("split") +_reg.register_strategy_injective("take") +_reg.register_strategy_injective("transpose") +_reg.register_strategy_injective("stack") +_reg.register_strategy_injective("_contrib_reverse_reshape") +_reg.register_strategy_injective("gather_nd") +_reg.register_strategy_injective("sequence_mask") +_reg.register_strategy_injective("one_hot") +_reg.register_strategy_reduce("collapse_sum_like") + +# concatenate +_reg.register_schedule("concatenate", strategy.schedule_concatenate) + +# strided_set +@_reg.register_compute("strided_set") +def compute_strided_set(attrs, inputs, output_type): + """Compute definition of strided_set""" + return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])] +_reg.register_strategy_injective("strided_set") # layout_transform -_reg.register_schedule("layout_transform", schedule_injective) +_reg.register_strategy_injective("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) -# shape func +# argwhere +@_reg.register_compute("argwhere") +def compute_argwhere(attrs, inputs, output_type): + """Compute definition of argwhere""" + output_shape = [] + for s in output_type.shape: + if hasattr(s, "value"): + output_shape.append(s) + else: + # see Any, replace it with a var + output_shape.append(tvm.var("any_dim", "int32")) + new_output_type = tvm.relay.ty.TensorType(output_shape, "int32") + return [topi.argwhere(new_output_type, inputs[0])] + +_reg.register_schedule("argwhere", strategy.schedule_argwhere) + +##################### +# Shape functions # +##################### + @script def _arange_shape_func(start, stop, step): out = output_tensor((1,), "int64") @@ -284,31 +306,6 @@ def argwhere_shape_func(attrs, inputs, out_ndims): return [_argwhere_shape_func_5d(inputs[0])] return ValueError("Does not support rank higher than 5 in argwhere") -@_reg.register_schedule("argwhere") -def schedule_argwhere(_, outs, target): - """Schedule definition of argwhere""" - with target: - return topi.generic.schedule_argwhere(outs) - - -@_reg.register_compute("argwhere") -def compute_argwhere(attrs, inputs, output_type, _): - """Compute definition of argwhere""" - output_shape = [] - for s in output_type.shape: - if hasattr(s, "value"): - output_shape.append(s) - else: - # see Any, replace it with a var - output_shape.append(tvm.var("any_dim", "int32")) - new_output_type = tvm.relay.ty.TensorType(output_shape, "int32") - return [topi.argwhere(new_output_type, inputs[0])] - -@_reg.register_compute("strided_set") -def compute_strided_set(attrs, inputs, output_type, _): - """Compute definition of strided_set""" - return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])] - @script def _layout_transform_shape_func(data_shape, out_layout_len, diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 586c30085601..5fcc112787a3 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -19,7 +19,7 @@ from tvm.runtime import TVMContext as _TVMContext from . import _make -from ..op import register_schedule, schedule_injective +from .. import op as reg def on_device(data, device): @@ -79,7 +79,7 @@ def checkpoint(data): """ return _make.checkpoint(data) -register_schedule("annotation.checkpoint", schedule_injective) +reg.register_strategy_injective("annotation.checkpoint") def compiler_begin(data, compiler): diff --git a/python/tvm/relay/op/contrib/_contrib.py b/python/tvm/relay/op/contrib/_contrib.py index 4b5588024411..16f22f1363c9 100644 --- a/python/tvm/relay/op/contrib/_contrib.py +++ b/python/tvm/relay/op/contrib/_contrib.py @@ -18,29 +18,19 @@ """Backend compiler related feature registration""" from __future__ import absolute_import -import topi from .. import op as reg -from ..op import schedule_injective, OpPattern +from .. import strategy +from ..op import OpPattern # adaptive_max_pool2d -@reg.register_schedule("contrib.adaptive_max_pool2d") -def schedule_adaptive_max_pool2d(_, outs, target): - """Schedule definition of adaptive_max_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - +reg.register_schedule("contrib.adaptive_max_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("contrib.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_avg_pool2d -@reg.register_schedule("contrib.adaptive_avg_pool2d") -def schedule_adaptive_avg_pool2d(_, outs, target): - """Schedule definition of adaptive_avg_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - +reg.register_schedule("contrib.adaptive_avg_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("contrib.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # relay.contrib.ndarray_size -reg.register_schedule("contrib.ndarray_size", schedule_injective) +reg.register_strategy_injective("contrib.ndarray_size") diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py index 89fde6dc1738..14a7080d5986 100644 --- a/python/tvm/relay/op/image/_image.py +++ b/python/tvm/relay/op/image/_image.py @@ -20,13 +20,10 @@ import topi from .. import op as reg -from ..op import schedule_injective # resize -reg.register_schedule("image.resize", schedule_injective) - @reg.register_compute("image.resize") -def compute_resize(attrs, inputs, out_type, target): +def compute_resize(attrs, inputs, out_type): size = attrs.size layout = attrs.layout method = attrs.method @@ -34,12 +31,12 @@ def compute_resize(attrs, inputs, out_type, target): out_dtype = attrs.out_dtype return [topi.image.resize(inputs[0], size, layout, method, coord_trans, out_dtype)] +reg.register_strategy_injective("image.resize") -# crop and resize -reg.register_schedule("image.crop_and_resize", schedule_injective) +# crop and resize @reg.register_compute("image.crop_and_resize") -def compute_crop_and_resize(attrs, inputs, out_type, target): +def compute_crop_and_resize(attrs, inputs, out_type): crop_size = attrs.crop_size layout = attrs.layout method = attrs.method @@ -48,3 +45,5 @@ def compute_crop_and_resize(attrs, inputs, out_type, target): return [topi.image.crop_and_resize(inputs[0], inputs[1], inputs[2], crop_size, layout, method, extrapolation_value, out_dtype)] + +reg.register_strategy_injective("image.crop_and_resize") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 3fdafd5b8628..4e0443fde59f 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -21,253 +21,79 @@ import topi from topi.util import get_const_tuple from .. import op as reg -from ..op import OpPattern, schedule_injective +from .. import strategy +from ..op import OpPattern from .._tensor import elemwise_shape_func from ....api import convert from ....hybrid import script # relu -reg.register_schedule("nn.relu", schedule_injective) +reg.register_strategy_broadcast("nn.relu") reg.register_pattern("nn.relu", OpPattern.ELEMWISE) -# softmax -@reg.register_schedule("nn.softmax") -def schedule_softmax(_, outputs, target): - """Schedule definition of softmax""" - with target: - return topi.generic.schedule_softmax(outputs) - +# softmax +reg.register_schedule("nn.softmax", strategy.schedule_softmax) reg.register_pattern("nn.softmax", OpPattern.OPAQUE) -schedule_broadcast = schedule_injective - - -@reg.register_schedule("nn.log_softmax") -def schedule_log_softmax(_, outputs, target): - """Schedule definition of log_softmax""" - with target: - return topi.generic.schedule_softmax(outputs) - +# log_softmax +reg.register_schedule("nn.log_softmax", strategy.schedule_softmax) reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) # dense -@reg.register_compute("nn.dense") -def compute_dense(attrs, inputs, out_type, target): - """Compute definition of dense""" - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - - -@reg.register_schedule("nn.dense") -def schedule_dense(attrs, outputs, target): - """Schedule definition of dense""" - with target: - return topi.generic.schedule_dense(outputs) - - +reg.register_strategy("nn.dense", strategy.dense_strategy) reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# fifo_buffer @reg.register_compute('nn.fifo_buffer') -def compute_fifo_buffer(attrs, inputs, out_type, target): +def compute_fifo_buffer(attrs, inputs, out_type): return [topi.nn.fifo_buffer(inputs[0], inputs[1], axis=attrs.get_int('axis'))] - -@reg.register_schedule('nn.fifo_buffer') -def schedule_fifo_buffer(attrs, outputs, target): - with target: - return topi.generic.schedule_injective(outputs) - - +reg.register_strategy_injective("nn.fifo_buffer") reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE) # batch_matmul -@reg.register_compute("nn.batch_matmul") -def compute_batch_matmul(attrs, inputs, out_type, target): - """Compute definition of batch_matmul""" - with target: - return [topi.nn.batch_matmul(inputs[0], inputs[1])] - - -@reg.register_schedule("nn.batch_matmul") -def schedule_batch_matmul(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_batch_matmul(outputs) - - +reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy) reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + # sparse_dense @reg.register_compute("nn.sparse_dense") -def compute_sparse_dense(attrs, inputs, out_type, target): +def compute_sparse_dense(attrs, inputs, out_type): """Compute definition of sparse_dense""" return [topi.nn.sparse_dense(inputs[0], inputs[1], inputs[2], inputs[3])] -@reg.register_schedule("nn.sparse_dense") -def schedule_sparse_dense(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_sparse_dense(outputs) - +reg.register_schedule("nn.sparse_dense", strategy.schedule_sparse_dense) reg.register_pattern("nn.sparse_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + # sparse_transpose @reg.register_compute("nn.sparse_transpose") -def compute_sparse_transpose(attrs, inputs, out_type, target): +def compute_sparse_transpose(attrs, inputs, out_type): """Compute definition of sparse_transpose""" return topi.nn.sparse_transpose(inputs[0], inputs[1], inputs[2]) -@reg.register_schedule("nn.sparse_transpose") -def schedule_sparse_transpose(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_sparse_transpose(outputs) - +reg.register_schedule("nn.sparse_transpose", strategy.schedule_sparse_transpose) reg.register_pattern("nn.sparse_transpose", reg.OpPattern.OUT_ELEMWISE_FUSABLE) -# Conv1D -@reg.register_compute("nn.conv1d") -def compute_conv1d(attrs, inputs, out_type, target): - """Compute definition of conv1d""" - strides = get_const_tuple(attrs.strides) - padding = get_const_tuple(attrs.padding) - dilation = get_const_tuple(attrs.dilation) - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCW", "NWC"] - if dilation[0] < 1: - raise ValueError("dilation should be a positive value") - - return [topi.nn.conv1d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] - - -@reg.register_schedule("nn.conv1d") -def schedule_conv1d(attrs, outs, target): - """Schedule definition of conv1d""" - layout = attrs.data_layout - - with target: - if layout == "NCW": - return topi.generic.schedule_conv1d_ncw(outs) - elif layout == "NCW": - return topi.generic.schedule_conv1d_nwc(outs) - raise ValueError("No compatible schedule") - - +# conv1d +reg.register_strategy("nn.conv1d", strategy.conv1d_strategy) reg.register_pattern("nn.conv1d", OpPattern.OUT_ELEMWISE_FUSABLE) # conv2d -def _find_conv2d_op(op): - """Find the op with conv2d in its tag by traversing.""" - if 'conv2d' in op.tag: - return op - for tensor in op.input_tensors: - op_ = _find_conv2d_op(tensor.op) - if op_ is not None: - return op_ - return None - -@reg.register_compute("nn.conv2d") -def compute_conv2d(attrs, inputs, out_type, target): - """Compute definition of conv2d""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - kernel_layout = attrs.kernel_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCHW", "NHWC", "NCHW4c", "HWCN"] - (dilation_h, dilation_w) = dilation - if dilation_h < 1 or dilation_w < 1: - raise ValueError("dilation should be positive value") - - def _get_out_depth(): - weight_shape = get_const_tuple(inputs[1].shape) - # NHWC layout - if kernel_layout.startswith("HW"): - return weight_shape[2] * weight_shape[3] - # NCHW layout. - # in ARM CPU contrib_spatial_pack schedule, we will prepack weight layout - if len(weight_shape) == 4: - return weight_shape[0] * weight_shape[1] - else: - assert len(weight_shape) == 5 - C, M, _, _, VC = weight_shape - return C * VC * M - - if groups == 1: - out = topi.nn.conv2d( - inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype) - elif layout == "NCHW" and _get_out_depth() == groups: - out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype) - elif layout == "NHWC" and kernel_layout == "HWOI" and _get_out_depth() == groups: - out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype) - elif layout in ['NCHW', 'NCHW4c']: - out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype) - else: - raise ValueError("not support arbitrary group number for now") - return [out] - - -@reg.register_schedule("nn.conv2d") -def schedule_conv2d(attrs, outs, target): - """Schedule definition of conv2d""" - groups = attrs.groups - layout = attrs.data_layout - kernel_layout = attrs.kernel_layout - - with target: - if groups == 1 and layout == "NCHW": - return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NCHW4c": - return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NHWC": - return topi.generic.schedule_conv2d_nhwc(outs) - elif groups == 1 and layout == "HWCN": - return topi.generic.schedule_conv2d_hwcn(outs) - elif groups != 1: - # collect in_channels to distinguish depthwise and group conv2d - op = _find_conv2d_op(outs[0].op) - assert op is not None - - is_depthwise = 'depthwise' in op.tag - if is_depthwise: - if layout == "NCHW": - # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. - return topi.generic.schedule_depthwise_conv2d_nchw(outs) - if layout == "NHWC" and kernel_layout == "HWOI": - return topi.generic.schedule_depthwise_conv2d_nhwc(outs) - else: - if layout in ["NCHW", "NCHW4c"]: - return topi.generic.schedule_group_conv2d_nchw(outs) - raise ValueError("No compatible schedule") - +reg.register_strategy("nn.conv2d", strategy.conv2d_strategy) +reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.conv2d") -def alter_op_layout_conv2d(attrs, inputs, tinfos): +def alter_op_layout_conv2d(attrs, inputs, tinfos, out_type): """Alternate the layout of conv2d""" - # pylint: disable=import-outside-toplevel - from ... import op - return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, op) + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) @reg.register_legalize("nn.conv2d") def legalize_conv2d(attrs, inputs, types): @@ -289,7 +115,6 @@ def legalize_conv2d(attrs, inputs, types): """ return topi.nn.conv2d_legalize(attrs, inputs, types) - @reg.register_convert_op_layout("nn.conv2d") def convert_conv2d(attrs, inputs, tinfos, desired_layout): """Convert Layout pass registration for conv2d op. @@ -330,82 +155,10 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layout): return relay.nn.conv2d(data, weight, **new_attrs) return None -reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) - # conv2d_transpose -@reg.register_compute("nn.conv2d_transpose") -def compute_conv2d_transpose(attrs, inputs, out_dtype, target): - """Compute definition of conv2d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - assert layout == "NCHW", "only support nchw for now" - assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" - out = topi.nn.conv2d_transpose_nchw( - inputs[0], inputs[1], strides, padding, out_dtype) - output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, - [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]]) - return [out] - - -@reg.register_compute("nn.conv3d") -def compute_conv3d(attrs, inputs, out_type, target): - """Compute definition of conv3d""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCDHW", "NDHWC"] - (dilation_d, dilation_h, dilation_w) = dilation - if dilation_d < 1 or dilation_h < 1 or dilation_w < 1: - raise ValueError("dilation should be positive value") - - if groups == 1: - out = topi.nn.conv3d( - inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype) - else: - raise ValueError("not support arbitrary group number for now") - return [out] - - -@reg.register_schedule("nn.conv3d") -def schedule_conv3d(attrs, outs, target): - """Schedule definition of conv3d""" - groups = attrs.groups - layout = attrs.data_layout - - with target: - if groups == 1 and layout == "NCDHW": - return topi.generic.schedule_conv3d_ncdhw(outs) - elif groups == 1 and layout == "NDHWC": - return topi.generic.schedule_conv3d_ndhwc(outs) - - raise ValueError("No compatible schedule") - - -reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) - - -@reg.register_schedule("nn.conv2d_transpose") -def schedule_conv2d_transpose(attrs, outs, target): - """Schedule definition of conv2d_transpose""" - with target: - return topi.generic.schedule_conv2d_transpose_nchw(outs) - +reg.register_strategy("nn.conv2d_transpose", strategy.conv2d_transpose_strategy) +reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.conv2d_transpose") def legalize_conv2d_transpose(attrs, inputs, types): @@ -427,202 +180,102 @@ def legalize_conv2d_transpose(attrs, inputs, types): """ return topi.nn.conv2d_transpose_legalize(attrs, inputs, types) -reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) - -# conv1d_transpose -@reg.register_compute("nn.conv1d_transpose") -def compute_conv1d_transpose(attrs, inputs, out_dtype, target): - """Compute definition of conv1d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - assert layout == "NCW", "conv1d_transpose ncw only supported" - assert dilation == (1,), "conv1d_transpose dilation is not supported" - assert groups == 1, "conv1d_transpose groups == 1 only supported" - out = topi.nn.conv1d_transpose_ncw( - inputs[0], inputs[1], strides, padding, out_dtype) - output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, - [0, 0, 0], [0, 0, output_padding[0]]) - return [out] +# conv3d +reg.register_strategy("nn.conv3d", strategy.conv3d_strategy) +reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_schedule("nn.conv1d_transpose") -def schedule_conv1d_transpose(attrs, outs, target): - """Schedule definition of conv1d_transpose""" - with target: - return topi.generic.schedule_conv1d_transpose_ncw(outs) +# conv1d_transpose +reg.register_strategy("nn.conv1d_transpose", strategy.conv1d_transpose_strategy) reg.register_pattern("nn.conv1d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) + # bias_add -reg.register_schedule("nn.bias_add", schedule_injective) +reg.register_strategy_injective("nn.bias_add") reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) # max_pool1d -@reg.register_schedule("nn.max_pool1d") -def schedule_max_pool1d(attrs, outs, target): - """Schedule definition of max_pool1d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool1d", strategy.schedule_pool) reg.register_pattern("nn.max_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d -@reg.register_schedule("nn.max_pool2d") -def schedule_max_pool2d(attrs, outs, target): - """Schedule definition of max_pool2d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool2d", strategy.schedule_pool) reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool3d -@reg.register_schedule("nn.max_pool3d") -def schedule_max_pool3d(attrs, outs, target): - """Schedule definition of max_pool3d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool3d", strategy.schedule_pool) reg.register_pattern("nn.max_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool1d -@reg.register_schedule("nn.avg_pool1d") -def schedule_avg_pool1d(attrs, outs, target): - """Schedule definition of avg_pool1d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.avg_pool1d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d -@reg.register_schedule("nn.avg_pool2d") -def schedule_avg_pool2d(attrs, outs, target): - """Schedule definition of avg_pool2d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - +reg.register_schedule("nn.avg_pool2d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool3d -@reg.register_schedule("nn.avg_pool3d") -def schedule_avg_pool3d(attrs, outs, target): - """Schedule definition of avg_pool3d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.avg_pool3d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d_grad -@reg.register_schedule("nn.max_pool2d_grad") -def schedule_max_pool2d_grad(attrs, outs, target): - """Schedule definition of max_pool2d_grad""" - with target: - return topi.generic.schedule_pool_grad(outs) - - +reg.register_schedule("nn.max_pool2d_grad", strategy.schedule_pool_grad) reg.register_pattern("nn.max_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d_grad -@reg.register_schedule("nn.avg_pool2d_grad") -def schedule_avg_pool2d_grad(attrs, outs, target): - """Schedule definition of avg_pool2d_grad""" - with target: - return topi.generic.schedule_pool_grad(outs) - - +reg.register_schedule("nn.avg_pool2d_grad", strategy.schedule_pool_grad) reg.register_pattern("nn.avg_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # global_max_pool2d -@reg.register_schedule("nn.global_max_pool2d") -def schedule_global_max_pool2d(_, outs, target): - """Schedule definition of global_max_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - - +reg.register_schedule("nn.global_max_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # global_avg_pool2d -@reg.register_schedule("nn.global_avg_pool2d") -def schedule_global_avg_pool2d(_, outs, target): - """Schedule definition of global_avg_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - - +reg.register_schedule("nn.global_avg_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # leaky_relu -reg.register_schedule("nn.leaky_relu", schedule_broadcast) +reg.register_strategy_broadcast("nn.leaky_relu") reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) + # prelu -reg.register_schedule("nn.prelu", schedule_broadcast) +reg.register_strategy_broadcast("nn.prelu") reg.register_pattern("nn.prelu", OpPattern.BROADCAST) + # flatten -reg.register_schedule("nn.batch_flatten", schedule_broadcast) +reg.register_strategy_broadcast("nn.batch_flatten") reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE) # lrn @reg.register_compute("nn.lrn") -def compute_lrn(attrs, inputs, out_dtype, target): +def compute_lrn(attrs, inputs, out_dtype): """Compute definition of lrn""" assert len(inputs) == 1 return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis, attrs.alpha, attrs.beta, attrs.bias)] - -@reg.register_schedule("nn.lrn") -def schedule_lrn(attrs, outs, target): - """Schedule definition of lrn""" - with target: - return topi.generic.schedule_lrn(outs) - - +reg.register_schedule("nn.lrn", strategy.schedule_lrn) reg.register_pattern("nn.lrn", OpPattern.OPAQUE) # upsampling -reg.register_schedule("nn.upsampling", reg.schedule_injective) - - -def schedule_upsampling(_, outs, target): - """Schedule definition of upsampling""" - with target: - return topi.generic.schedule_injective(outs) - @reg.register_compute("nn.upsampling") -def compute_upsampling(attrs, inputs, out_dtype, target): +def compute_upsampling(attrs, inputs, out_dtype): scale_h = attrs.scale_h scale_w = attrs.scale_w layout = attrs.layout @@ -630,16 +283,12 @@ def compute_upsampling(attrs, inputs, out_dtype, target): align_corners = attrs.align_corners return [topi.nn.upsampling(inputs[0], scale_h, scale_w, layout, method, align_corners)] -# upsampling3d -reg.register_schedule("nn.upsampling3d", reg.schedule_injective) +reg.register_strategy_injective("nn.upsampling") -def schedule_upsampling3d(_, outs, target): - """Schedule definition of upsampling3d""" - with target: - return topi.generic.schedule_injective(outs) +# upsampling3d @reg.register_compute("nn.upsampling3d") -def compute_upsampling3d(attrs, inputs, out_dtype, target): +def compute_upsampling3d(attrs, inputs, out_dtype): scale_d = attrs.scale_d scale_h = attrs.scale_h scale_w = attrs.scale_w @@ -649,12 +298,14 @@ def compute_upsampling3d(attrs, inputs, out_dtype, target): return [topi.nn.upsampling3d(inputs[0], scale_d, scale_h, scale_w, layout, method,\ coordinate_transformation_mode)] +reg.register_strategy_injective("nn.upsampling3d") + + # pad -reg.register_schedule("nn.pad", schedule_broadcast) +reg.register_strategy_broadcast("nn.pad") -# mirror_pad -reg.register_schedule("nn.mirror_pad", schedule_broadcast) +# mirror_pad @reg.register_compute("nn.mirror_pad") def compute_mirror_pad(attrs, inputs, out_dtype, target): pad_before, pad_after = list(zip(*attrs.pad_width)) @@ -662,284 +313,78 @@ def compute_mirror_pad(attrs, inputs, out_dtype, target): out = topi.nn.mirror_pad(inputs[0], pad_before=pad_before, pad_after=pad_after, mode=mode) return [out] -# winograd related operators -@reg.register_compute("nn.contrib_conv2d_winograd_without_weight_transform") -def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, out_dtype, target): - """Compute definition of conv2d_winograd_without_weight_transform""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - data_layout = attrs.get_str("data_layout") - out_dtype = attrs.get_str("out_dtype") - tile_size = attrs.get_int("tile_size") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - assert dilation == (1, 1), "Do not support dilate now" - assert groups == 1, "Do not supoort arbitrary group number" - - out = topi.nn.conv2d_winograd_without_weight_transform( - inputs[0], inputs[1], strides, padding, dilation, data_layout, - out_dtype, tile_size) - - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_winograd_without_weight_transform") -def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target): - """Schedule definition of conv2d_winograd_without_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_without_weight_transform(outs) +reg.register_strategy_broadcast("nn.mirror_pad") +# conv2d_winograd related operators +reg.register_strategy("nn.contrib_conv2d_winograd_without_weight_transform", + strategy.conv2d_winograd_without_weight_transfrom_strategy) reg.register_pattern("nn.contrib_conv2d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_compute("nn.contrib_conv2d_winograd_weight_transform") -def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype, target): +def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_winograd_weight_transform""" out = topi.nn.conv2d_winograd_weight_transform( inputs[0], attrs.get_int('tile_size')) return [out] - -@reg.register_schedule("nn.contrib_conv2d_winograd_weight_transform") -def schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target): - """Schedule definition of contrib_conv2d_winograd_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_weight_transform(outs) - - +reg.register_schedule("nn.contrib_conv2d_winograd_weight_transform", + strategy.schedule_conv2d_winograd_weight_transform) reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) - -# winograd nnpack related operators -@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -def compute_contrib_conv2d_winograd_nnpack_without_weight_transform( - attrs, inputs, out_dtype, target): - """Compute definition of conv2d_winograd_nnpack_without_weight_transform""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - data_layout = attrs.get_str("data_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - assert dilation == (1, 1), "Do not support dilate now" - assert groups == 1, "Do not supoort arbitrary group number" - - # No bias - out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( - inputs[0], inputs[1], None, strides, padding, dilation, data_layout, - out_dtype) - - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): - """Schedule definition of conv2d_winograd_nnpack_without_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) - - -reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_without_weight_transform", - OpPattern.OPAQUE) - - @reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform") -def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype, target): +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform""" convolution_algorithm = attrs.get_int('convolution_algorithm') out = topi.nn.conv2d_winograd_nnpack_weight_transform( inputs[0], convolution_algorithm, out_dtype) return [out] - -@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform") -def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): - """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) - - +reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform", + strategy.schedule_conv2d_winograd_nnpack_weight_transform) reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) -@reg.register_compute("nn.contrib_conv2d_NCHWc") -def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target): - """Compute definition of conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_NCHWc") -def schedule_contrib_conv2d_NCHWc(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc""" - with target: - return topi.generic.schedule_conv2d_NCHWc(outs) - - +# conv2d_NCHWc +reg.register_strategy("nn.contrib_conv2d_NCHWc", strategy.conv2d_NCHWc_strategy) reg.register_pattern("nn.contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) - -@reg.register_compute("nn.contrib_conv2d_NCHWc_int8") -def compute_contrib_conv2d_NCHWc_int8(attrs, inputs, out_dtype, target): - """Compute definition of conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = topi.nn.conv2d_NCHWc_int8(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_NCHWc_int8") -def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc_int8""" - with target: - return topi.generic.schedule_conv2d_NCHWc_int8(outs) - - -reg.register_pattern("nn.contrib_conv2d_NCHWc_int8", - OpPattern.OUT_ELEMWISE_FUSABLE) - - -@reg.register_compute("nn.contrib_depthwise_conv2d_NCHWc") -def compute_contrib_depthwise_conv2d_NCHWc(attrs, inputs, out_dtype, target): - """Compute definition of depthwise conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_depthwise_conv2d_NCHWc") -def schedule_contrib_depthwise_conv2d_NCHWc(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc""" - with target: - return topi.generic.schedule_depthwise_conv2d_NCHWc(outs) - - +# depthwise_conv2d_NCHWc +reg.register_strategy("nn.contrib_depthwise_conv2d_NCHWc", + strategy.depthwise_conv2d_NCHWc_strategy) reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("nn.deformable_conv2d") -def compute_deformable_conv2d(attrs, inputs, out_dtype, target): - """Compute definition of deformable_conv2d""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - deformable_groups = attrs.deformable_groups - groups = attrs.groups - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype - with target: - out = topi.nn.deformable_conv2d_nchw(inputs[0], inputs[1], inputs[2], strides, padding, - dilation, deformable_groups, groups, out_dtype) - return [out] - - -@reg.register_schedule("nn.deformable_conv2d") -def schedule_deformable_conv2d(attrs, outs, target): - """Schedule definition of deformable_conv2d""" - with target: - return topi.generic.schedule_deformable_conv2d_nchw(outs) - - +# deformable_conv2d +reg.register_strategy("nn.deformable_conv2d", strategy.deformable_conv2d_strategy) reg.register_pattern("nn.deformable_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) +# bitpack @reg.register_compute("nn.bitpack") -def compute_bitpack(attrs, inputs, out_dtype, target): +def compute_bitpack(attrs, inputs, out_dtype): """Compute definition for bitpack""" bits = attrs.bits pack_axis = attrs.pack_axis bit_axis = attrs.bit_axis pack_type = attrs.pack_type name = attrs.name - with target: - out = topi.nn.bitpack(inputs[0], bits, pack_axis, bit_axis, pack_type, - name) + out = topi.nn.bitpack(inputs[0], bits, pack_axis, bit_axis, pack_type, name) return [out] -@reg.register_schedule("nn.bitpack") -def schedule_bitpack(attrs, outs, target): - with target: - return topi.generic.schedule_bitpack(outs) - +reg.register_schedule("nn.bitpack", strategy.schedule_bitpack) reg.register_pattern("nn.bitpack", OpPattern.INJECTIVE) -@reg.register_compute("nn.bitserial_conv2d") -def compute_bitserial_conv2d(attrs, inputs, out_dtype, target): - """Compute definition for bitserial conv2d.""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - activation_bits = attrs.activation_bits - weight_bits = attrs.weight_bits - layout = attrs.data_layout - pack_dtype = attrs.pack_dtype - out_dtype = attrs.out_dtype - unipolar = attrs.unipolar - if layout == 'NCHW': - with target: - out = topi.nn.bitserial_conv2d_nchw( - inputs[0], inputs[1], strides, padding, activation_bits, - weight_bits, pack_dtype, out_dtype, unipolar) - elif layout == 'NHWC': - with target: - out = topi.nn.bitserial_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, activation_bits, - weight_bits, pack_dtype, out_dtype, unipolar) - else: - raise ValueError("Data layout not supported.") - - return [out] - - -@reg.register_schedule("nn.bitserial_conv2d") -def schedule_bitserial_conv2d(attrs, outs, target): - """Schedule definition for bitserial conv2d.""" - layout = attrs.data_layout - if layout == 'NCHW': - with target: - return topi.generic.schedule_bitserial_conv2d_nchw(outs) - elif layout == 'NHWC': - with target: - return topi.generic.schedule_bitserial_conv2d_nhwc(outs) - else: - raise ValueError("Data layout not supported.") +# bitserial_conv2d +reg.register_strategy("nn.bitserial_conv2d", strategy.bitserial_conv2d_strategy) +reg.register_pattern("nn.bitserial_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.bitserial_conv2d") def legalize_bitserial_conv2d(attrs, inputs, types): @@ -962,79 +407,58 @@ def legalize_bitserial_conv2d(attrs, inputs, types): return topi.nn.bitserial_conv2d_legalize(attrs, inputs, types) -reg.register_pattern("nn.bitserial_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) - - # bitserial_dense -@reg.register_compute("nn.bitserial_dense") -def compute_bitserial_dense(attrs, inputs, out_type, target): - """Compute definition of bitserial_dense""" - data_bits = attrs.data_bits - weight_bits = attrs.weight_bits - pack_dtype = attrs.pack_dtype - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - unipolar = attrs.unipolar - return [ - topi.nn.bitserial_dense( - inputs[0], - inputs[1], - data_bits, - weight_bits, - pack_dtype, - out_dtype, - unipolar) - ] - - -@reg.register_schedule("nn.bitserial_dense") -def schedule_bitserial_dense(attrs, outputs, target): - """Schedule definition of bitserial_dense""" - with target: - return topi.generic.schedule_bitserial_dense(outputs) - - +reg.register_strategy("nn.bitserial_dense", strategy.bitserial_dense_strategy) reg.register_pattern("nn.bitserial_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) -reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) - +# cross_entropy @reg.register_compute("nn.cross_entropy") -def compute_cross_entropy(attrs, inputs, out_dtype, target): +def compute_cross_entropy(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(topi.log(x) * y) / x.shape[0]] +reg.register_strategy_reduce("nn.cross_entropy") +reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) -reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) +# cross_entropy_with_logits @reg.register_compute("nn.cross_entropy_with_logits") -def compute_cross_entropy_with_logits(attrs, inputs, out_dtype, target): +def compute_cross_entropy_with_logits(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(x * y) / x.shape[0]] +reg.register_strategy_reduce("nn.cross_entropy_with_logits") +reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) + +# depth_to_space @reg.register_compute("nn.depth_to_space") -def compute_depth_to_space(attrs, inputs, out_dtype, target): +def compute_depth_to_space(attrs, inputs, out_dtype): block_size = attrs.block_size layout = attrs.layout mode = attrs.mode return [topi.nn.depth_to_space(inputs[0], block_size, layout=layout, mode=mode)] -reg.register_schedule("nn.depth_to_space", schedule_injective) +reg.register_strategy_injective("nn.depth_to_space") reg.register_pattern("nn.depth_to_space", OpPattern.INJECTIVE) +# space_to_depth @reg.register_compute("nn.space_to_depth") -def compute_space_to_depth(attrs, inputs, out_dtype, target): +def compute_space_to_depth(attrs, inputs, out_dtype): block_size = attrs.block_size layout = attrs.layout return [topi.nn.space_to_depth(inputs[0], block_size, layout=layout)] -reg.register_schedule("nn.space_to_depth", schedule_injective) +reg.register_strategy_injective("nn.space_to_depth") reg.register_pattern("nn.space_to_depth", OpPattern.INJECTIVE) -# shape func +##################### +# Shape functions # +##################### + @script def _conv2d_NCHWc_shape_func(dshape, kshape, strides, padding, dilation, oc_bn): out = output_tensor((dshape.shape[0],), "int64") diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 9ee43438f83d..eaf41cf7871a 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -204,7 +204,8 @@ def conv2d(data, # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged # convert 2-way padding to 4-way padding padding = get_pad_tuple2d(padding) - + if not out_layout: + out_layout = data_layout return _make.conv2d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -298,7 +299,8 @@ def conv3d(data, dilation = (dilation, dilation, dilation) if isinstance(padding, int): padding = (padding, padding, padding) - + if not out_layout: + out_layout = data_layout return _make.conv3d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -367,6 +369,8 @@ def conv2d_transpose(data, """ # convert 2-way padding to 4-way padding padding = get_pad_tuple2d(padding) + if not out_layout: + out_layout = data_layout return _make.conv2d_transpose(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, output_padding, out_dtype) @@ -433,6 +437,8 @@ def conv1d_transpose(data, result : tvm.relay.Expr The computed result. """ + if not out_layout: + out_layout = data_layout return _make.conv1d_transpose(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, output_padding, out_dtype) @@ -1772,74 +1778,6 @@ def contrib_conv2d_winograd_without_weight_transform(data, kernel_layout, out_layout, out_dtype) -def contrib_conv2d_winograd_nnpack_without_weight_transform(data, - weight, - strides=(1, 1), - padding=(0, 0), - dilation=(1, 1), - groups=1, - channels=None, - kernel_size=None, - data_layout="NCHW", - kernel_layout="OIHW", - out_layout="", - out_dtype=""): - r"""2D convolution with the NNPACK implementation of winograd algorithm. - - The basic parameters are the same as the ones in vanilla conv2d. - It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_nnpack_weight_transform - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - weight : tvm.relay.Expr - The weight expressions. - - strides : tuple of int, optional - The strides of convolution. - - padding : tuple of int, optional - The padding of convolution on both sides of inputs before convolution. - - dilation : tuple of int, optional - Specifies the dilation rate to be used for dilated convolution. - - groups : int, optional - Number of groups for grouped convolution. - - channels : int, optional - Number of output channels of this convolution. - - kernel_size : tuple of int, optional - The spatial of the convolution kernel. - - data_layout : str, optional - Layout of the input. - - kernel_layout : str, optional - Layout of the weight. - - out_layout : str, optional - Layout of the output, by default, out_layout is the same as data_layout - - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - # convert 2-way padding to 4-way padding - padding = get_pad_tuple2d(padding) - return _make.contrib_conv2d_winograd_nnpack_without_weight_transform( - data, weight, strides, padding, dilation, - groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype) - - def contrib_conv2d_nchwc(data, kernel, strides=(1, 1), @@ -1974,73 +1912,6 @@ def contrib_depthwise_conv2d_nchwc(data, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) -def contrib_conv2d_nchwc_int8(data, - kernel, - strides=(1, 1), - padding=(0, 0), - dilation=(1, 1), - groups=1, - channels=None, - kernel_size=None, - data_layout="NCHW8c", - kernel_layout="OIHW", - out_layout="", - out_dtype=""): - r"""Variant of 2D convolution. It deals with only int8 inputs. - - This operator takes the weight as the convolution kernel - and convolves it with data to produce an output, following a specialized - NCHWc data layout. - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - kernel : tvm.relay.Expr - The kernel expressions. - - strides : tuple of int, optional - The strides of convolution. - - padding : tuple of int, optional - The padding of convolution on both sides of inputs before convolution. - - dilation : tuple of int, optional - Specifies the dilation rate to be used for dilated convolution. - - groups : int, optional - Number of groups for grouped convolution. - - channels : int, optional - Number of output channels of this convolution. - - kernel_size : tuple of int, optional - The spatial of the convolution kernel. - - data_layout : str, optional - Layout of the input. - - kernel_layout : str, optional - Layout of the weight. - - out_layout : str, optional - Layout of the output, by default, out_layout is the same as data_layout - - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - # convert 2-way padding to 4-way padding - padding = get_pad_tuple2d(padding) - return _make.contrib_conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, - groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype) - def contrib_conv2d_winograd_weight_transform(weight, tile_size): diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index c6d301213e98..0a1500203db2 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -14,15 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=unused-argument +#pylint: disable=unused-argument,invalid-name """The base node types for the Relay language.""" -import topi import tvm._ffi from tvm.driver import lower, build from ..base import register_relay_node from ..expr import RelayExpr from ...api import register_func +from ...target import get_native_generic_func, GenericFunc from . import _make @register_relay_node @@ -143,21 +143,47 @@ class OpPattern(object): OPAQUE = 8 -def register_schedule(op_name, schedule=None, level=10): - """Register schedule function for an op +@register_relay_node +class OpImplement(Expr): + """Operator implementation""" + def compute(self, attrs, inputs, out_type): + return _OpImplementCompute(self, attrs, inputs, out_type) - Parameters - ---------- - op_name : str - The name of the op. + def schedule(self, attrs, outs, target): + return _OpImplementSchedule(self, attrs, outs, target) - schedule : function (attrs: Attrs, outs: List[Tensor], target: Target) -> sch: Schedule - The schedule function. - level : int - The priority level - """ - return register(op_name, "FTVMSchedule", schedule, level) +@register_relay_node +class OpSpecialization(Expr): + """Operator specialization""" + + +@register_relay_node +class OpStrategy(Expr): + def __init__(self): + self.__init_handle_by_constructor__(_make.OpStrategy) + + def add_implement(self, compute, schedule, plevel=10): + _OpStrategyAddImplement(self, compute, schedule, plevel) + + +def wrap_fstrategy(compute, schedule): + def fstrategy(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implement(compute, schedule) + return strategy + return fstrategy + + +def create_simple_fstrategy(op_name, schedule): + assert hasattr(schedule, "dispatch_dict") + compute = get(op_name).get_attr("FTVMCompute") + assert compute is not None, "FTVMCompute is not registered for op %s" % op_name + fstrategy = get_native_generic_func("{}_strategy".format(op_name)) + fstrategy.set_default(wrap_fstrategy(compute, schedule.fdefault)) + for key, sch in schedule.dispatch_dict.items(): + fstrategy.register(wrap_fstrategy(compute, sch), [key]) + return fstrategy def register_compute(op_name, compute=None, level=10): @@ -178,6 +204,30 @@ def register_compute(op_name, compute=None, level=10): return register(op_name, "FTVMCompute", compute, level) +def register_strategy(op_name, fstrategy=None, level=10): + if not isinstance(fstrategy, GenericFunc): + assert hasattr(fstrategy, "generic_func_node") + fstrategy = fstrategy.generic_func_node + return register(op_name, "FTVMStrategy", fstrategy, level) + + +def register_schedule(op_name, schedule, level=10): + fstrategy = create_simple_fstrategy(op_name, schedule) + return register_strategy(op_name, fstrategy, level) + + +def register_strategy_injective(op_name, level=10): + return register_schedule(op_name, _schedule_injective, level) + + +def register_strategy_broadcast(op_name, level=10): + return register_schedule(op_name, _schedule_injective, level) + + +def register_strategy_reduce(op_name, level=10): + return register_schedule(op_name, _schedule_reduce, level) + + def register_alter_op_layout(op_name, alter_layout=None, level=10): """Register alter op layout function for an op @@ -245,6 +295,7 @@ def register_pattern(op_name, pattern, level=10): """ return register(op_name, "TOpPattern", pattern, level) + def register_gradient(op_name, fgradient=None, level=10): """Register operator pattern for an op. @@ -261,6 +312,7 @@ def register_gradient(op_name, fgradient=None, level=10): """ return register(op_name, "FPrimalGradient", fgradient, level) + def register_shape_func(op_name, data_dependant, shape_func=None, level=10): """Register operator shape function for an op. @@ -290,18 +342,8 @@ def _lower(name, schedule, inputs, outputs): def _build(lowered_funcs): return build(lowered_funcs, target="llvm") - -def schedule_injective(attrs, outputs, target): - """Generic schedule for binary broadcast.""" - with target: - return topi.generic.schedule_injective(outputs) - - -def schedule_concatenate(attrs, outputs, target): - """Generic schedule for concatinate.""" - with target: - return topi.generic.schedule_concatenate(outputs) - +_schedule_injective = None +_schedule_reduce = None __DEBUG_COUNTER__ = 0 diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py new file mode 100644 index 000000000000..cbb9eb6470e7 --- /dev/null +++ b/python/tvm/relay/op/strategy/__init__.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=wildcard-import +"""Relay op strategies.""" +from __future__ import absolute_import as _abs + +from .generic import * +from . import x86 +from . import arm_cpu +from . import cuda +from . import hls +from . import mali +from . import opengl +from . import rocm +from . import intel_graphics diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py new file mode 100644 index 000000000000..72b5b1aa5b79 --- /dev/null +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -0,0 +1,203 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of ARM CPU operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import re +import logging + +import topi +from .generic import * +from .. import op as _op + +logger = logging.getLogger('strategy') + +@schedule_injective.register("arm_cpu") +def schedule_injective_arm_cpu(_, outs, target): + """schedule injective ops for arm cpu""" + with target: + return topi.arm_cpu.schedule_injective(outs) + +@schedule_concatenate.register("arm_cpu") +def schedule_concatenate_arm_cpu(_, outs, target): + """schedule concatenate for arm cpu""" + with target: + return topi.arm_cpu.schedule_concatenate(outs) + +@conv2d_strategy.register("arm_cpu") +def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d arm cpu strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + stride_h, stride_w = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack)) + + _, _, kh, kw = get_const_tuple(kernel.shape) + pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), + 15) + if pt == 1 and pb == 1 and pl == 1 and pr == 1: + strategy.add_implement( + wrap_compute_conv2d_winograd_nnpack( + topi.arm_cpu.conv2d_nchw_winograd_nnpack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), + 13) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + logger.warning("conv2d with layout HWCN is not optimized for arm cpu.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack)) + else: + raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout) + if kernel_layout == "OIHW": + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw)) + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), + 15) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {} for arm cpu". + format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + else: + raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu". + format(layout)) + return strategy + +def wrap_compute_conv2d_winograd_nnpack(topi_compute): + """wrap topi compute for conv2d_winograd NNPack""" + def _compute_conv2d_nnpack(attrs, inputs, out_type): + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + out_dtype = attrs.get_str("out_dtype") + out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype + return [topi_compute(inputs[0], inputs[1], None, strides, padding, + dilation, out_dtype)] + return _compute_conv2d_nnpack + +@conv2d_winograd_without_weight_transfrom_strategy.register("arm_cpu") +def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom arm cpu strategy""" + dilation = attrs.get_int_tuple("dilation") + padding = attrs.get_int_tuple("padding") + groups = attrs.get_int("groups") + layout = attrs.data_layout + stride_h, stride_w = attrs.get_int_tuple("strides") + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout == "NCHW": + _, _, kh, kw = get_const_tuple(inputs[1].shape) + pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) + assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 + strategy.add_implement( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd)) + if pt == 1 and pb == 1 and pl == 1 and pr == 1: + strategy.add_implement( + wrap_compute_conv2d_winograd_nnpack( + topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform), + wrap_topi_schedule( + topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform), + 5) + else: + raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". + format(layout)) + return strategy + +@conv2d_transpose_strategy.register("arm_cpu") +def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d_transpose arm cpu strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_comptue_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw)) + return strategy + +@bitserial_conv2d_strategy.register("arm_cpu") +def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): + """bitserial_conv2d x86 strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw)) + elif layout == "NHWC": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.arm_cpu.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc)) + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy + +@bitserial_dense_strategy.register("arm_cpu") +def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): + """bitserial_dense arm cpu strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense), + wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense)) + return strategy diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py new file mode 100644 index 000000000000..9407000faed9 --- /dev/null +++ b/python/tvm/relay/op/strategy/bifrost.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of bifrost operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import + +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + + +@conv2d_strategy.register("bifrost") +def conv2d_strategy_bifrost(attrs, inputs, out_type, target): + """conv2d mali(bifrost) strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + stride_h, stride_w = attrs.get_int_tuple("strides") + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack)) + + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), + 15) + else: + raise RuntimeError("Unsupported conv2d layout {} for Mali(Bifrost)". + format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {} for Mali(Bifrost)". + format(layout)) + else: # group_conv2d + raise RuntimeError("group_conv2d is not supported for Mali(Bifrost)") + return strategy + +@conv2d_winograd_without_weight_transfrom_strategy.register("bifrost") +def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom mali(bifrost) strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.data_layout + stride_h, stride_w = attrs.get_int_tuple("strides") + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout == "NCHW": + _, _, kh, kw = get_const_tuple(inputs[1].shape) + assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 + strategy.add_implement( + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd)) + else: + raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". + format(layout)) + return strategy + +@dense_strategy.register("bifrost") +def dense_strategy_bifrost(attrs, inputs, out_type, target): + """dense mali(bifrost) strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_dense(topi.bifrost.dense), + wrap_topi_schedule(topi.bifrost.schedule_dense)) + return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py new file mode 100644 index 000000000000..ca07604e6418 --- /dev/null +++ b/python/tvm/relay/op/strategy/cuda.py @@ -0,0 +1,352 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of CUDA/GPU operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op +from ....schedule import SpecializedCondition + +@schedule_injective.register(["cuda", "gpu"]) +def schedule_injective_cuda(attrs, outs, target): + """schedule injective ops for cuda""" + with target: + return topi.cuda.schedule_injective(outs) + +@schedule_reduce.register(["cuda", "gpu"]) +def schedule_reduce_cuda(attrs, outs, target): + """schedule reduction ops for cuda""" + with target: + return topi.cuda.schedule_reduce(outs) + +@schedule_concatenate.register(["cuda", "gpu"]) +def schedule_concatenate_cuda(attrs, outs, target): + """schedule concatenate for cuda""" + with target: + return topi.cuda.schedule_injective(outs) + +@schedule_pool.register(["cuda", "gpu"]) +def schedule_pool_cuda(attrs, outs, target): + """schedule pooling ops for cuda""" + with target: + return topi.cuda.schedule_pool(outs, attrs.layout) + +@schedule_pool_grad.register(["cuda", "gpu"]) +def schedule_pool_grad_cuda(attrs, outs, target): + """schedule pooling gradient ops for cuda""" + with target: + return topi.cuda.schedule_pool_grad(outs) + +@schedule_adaptive_pool.register(["cuda", "gpu"]) +def schedule_adaptive_pool_cuda(attrs, outs, target): + """schedule adaptive pooling ops for cuda""" + with target: + return topi.cuda.schedule_adaptive_pool(outs) + +@schedule_softmax.register(["cuda", "gpu"]) +def schedule_softmax_cuda(attrs, outs, target): + """schedule softmax for cuda""" + with target: + return topi.cuda.schedule_softmax(outs) + +@schedule_lrn.register(["cuda", "gpu"]) +def schedule_lrn_cuda(attrs, outs, target): + """schedule LRN for cuda""" + with target: + return topi.cuda.schedule_lrn(outs) + +@schedule_l2_normalize.register(["cuda", "gpu"]) +def schedule_l2_normalize_cuda(attrs, outs, target): + """schedule L2 normalize for cuda""" + with target: + return topi.cuda.schedule_l2_normalize(outs) + +@conv2d_strategy.register(["cuda", "gpu"]) +def conv2d_strategy_cuda(attrs, inputs, out_type, target): + """conv2d cuda strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + stride_h, stride_w = attrs.get_int_tuple("strides") + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + padding = attrs.get_int_tuple("padding") + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw)) + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), + 15) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_hwcn), + wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn)) + # Re-enable this after @alexgl-github fix the conv2d_nhwc for cuda + # elif layout == "NHWC": + # assert kernel_layout == "HWIO" + # strategy.add_implement( + # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), + # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc)) + elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: + assert kernel_layout == "OIHW4o4i" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), + wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8)) + else: + raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) + # add cudnn implementation + if target.target_name == "cuda" and "cudnn" in target.libs: + if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \ + padding[1] == padding[3]: + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True), + wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), 5) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw)) + elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: + assert kernel_layout == "OIHW4o4i" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8)) + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +@conv2d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"]) +def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom cuda strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.data_layout + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout == "NCHW": + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform), + wrap_topi_schedule( + topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform_cuda)) + else: + raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". + format(layout)) + return strategy + +@deformable_conv2d_strategy.register(["cuda", "gpu"]) +def deformable_conv2d_strategy_cuda(attrs, inputs, out_type, target): + """deformable_conv2d cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw)) + return strategy + +@conv2d_transpose_strategy.register(["cuda", "gpu"]) +def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): + """conv2d_transpose cuda strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_comptue_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw)) + return strategy + +@conv3d_strategy.register(["cuda", "gpu"]) +def conv3d_strategy_cuda(attrs, inputs, out_type, target): + """conv3d cuda strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) + if layout == "NCDHW": + strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), + wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), + 10) + else: # layout == "NDHWC": + strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), + wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), + 10) + if target.target_name == "cuda" and "cudnn" in target.libs: + strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_cudnn), + wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), + 15) + return strategy + +@conv1d_strategy.register(["cuda", "gpu"]) +def conv1d_strategy_cuda(attrs, inputs, out_type, target): + """conv1d cuda strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + strategy = _op.OpStrategy() + if layout == "NCW": + strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw)) + elif layout == "NWC": + strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_nwc), + wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc)) + else: + raise ValueError("Unsupported conv1d layout {}".format(layout)) + return strategy + +@conv1d_transpose_strategy.register(["cuda", "gpu"]) +def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): + """conv1d_transpose cuda strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCW", "conv1d_transpose ncw only supported" + assert dilation == (1,), "conv1d_transpose dilation is not supported" + assert groups == 1, "conv1d_transpose groups == 1 only supported" + strategy.add_implement(wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw)) + return strategy + +@dense_strategy.register(["cuda", "gpu"]) +def dense_strategy_cuda(attrs, inputs, out_type, target): + """dense cuda strategy""" + strategy = _op.OpStrategy() + if out_type.dtype == "int8": + strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8), + wrap_topi_schedule(topi.cuda.schedule_dense_int8)) + else: + strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch)) + b = inputs[0].shape[0] + with SpecializedCondition(b >= 32): + strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_large_batch)) + if target.target_name == "cuda" and "cublas" in target.libs: + strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas), + wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5) + return strategy + +@batch_matmul_strategy.register(["cuda", "gpu"]) +def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): + """batch_matmul cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), + wrap_topi_schedule(topi.cuda.schedule_batch_matmul), + 10) + if target.target_name == "cuda" and "cublas" in target.libs: + strategy.add_implement(wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), + wrap_topi_schedule(topi.generic.schedule_extern), + 15) + return strategy + +@argsort_strategy.register(["cuda", "gpu"]) +def argsort_strategy_cuda(attrs, inputs, out_type, target): + """argsort cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_argsort(topi.cuda.argsort_gpu), + wrap_topi_schedule(topi.cuda.schedule_argsort)) + return strategy + +@topk_strategy.register(["cuda", "gpu"]) +def topk_strategy_cuda(attrs, inputs, out_type, target): + """topk cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_topk(topi.cuda.topk_gpu), + wrap_topi_schedule(topi.cuda.schedule_topk)) + return strategy + +@schedule_multibox_prior.register(["cuda", "gpu"]) +def schedule_multibox_prior_cuda(attrs, outs, target): + """schedule multibox_prior for cuda""" + with target: + return topi.cuda.schedule_multibox_prior(outs) + +@schedule_multibox_transform_loc.register(["cuda", "gpu"]) +def schedule_multibox_transform_loc_cuda(attrs, outs, target): + """schedule multibox_transform_loc for cuda""" + with target: + return topi.cuda.schedule_multibox_transform_loc(outs) + +@get_valid_counts_strategy.register(["cuda", "gpu"]) +def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target): + """get_valid_counts cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_get_valid_counts(topi.cuda.get_valid_counts), + wrap_topi_schedule(topi.cuda.schedule_get_valid_counts)) + return strategy + +@nms_strategy.register(["cuda", "gpu"]) +def nms_strategy_cuda(attrs, inputs, out_type, target): + """nms cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_nms(topi.cuda.non_max_suppression), + wrap_topi_schedule(topi.cuda.schedule_nms)) + return strategy + +@roi_align_strategy.register(["cuda", "gpu"]) +def roi_align_strategy_cuda(attrs, inputs, out_type, target): + """roi_align cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.cuda.schedule_roi_align)) + return strategy + +@schedule_roi_pool.register(["cuda", "gpu"]) +def schedule_roi_pool_cuda(attrs, outs, target): + """schedule roi_pool for cuda""" + with target: + return topi.cuda.schedule_roi_pool(outs) + +@proposal_strategy.register(["cuda", "gpu"]) +def proposal_strategy_cuda(attrs, inputs, out_type, target): + """proposal cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_proposal(topi.cuda.proposal), + wrap_topi_schedule(topi.cuda.schedule_proposal)) + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py new file mode 100644 index 000000000000..73923e554579 --- /dev/null +++ b/python/tvm/relay/op/strategy/generic.py @@ -0,0 +1,678 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of generic operator strategy.""" +# pylint: disable=invalid-name,unused-argument +from __future__ import absolute_import + +import re +import topi +from topi.util import get_const_int, get_const_float, get_const_tuple, get_float_tuple +from .. import op as _op +from ....target import generic_func, override_native_generic_func + +def wrap_topi_schedule(topi_schedule): + """Wrap TOPI schedule which doesn't use attrs""" + def wrapper(attrs, outs, target): + with target: + return topi_schedule(outs) + return wrapper + +def get_conv2d_in_channels(data_shape, data_layout): + """Get conv2d input channels""" + data_shape = get_const_tuple(data_shape) + if len(data_shape) == 4: + idx = data_layout.find("C") + assert idx >= 0, "Invalid conv2d data layout {}".format(data_layout) + return data_shape[idx] + elif re.match(r"NCHW\d*c", data_layout): + # NCHW[8]c + return data_shape[1] * data_shape[4] + else: + raise ValueError("Unknown conv2d data layout {}".format(data_layout)) + +def get_conv2d_out_channels(kernel_shape, kernel_layout): + """Get conv2d output channels""" + kernel_shape = get_const_tuple(kernel_shape) + if len(kernel_shape) == 4: + idx = kernel_layout.find("O") + assert idx >= 0, "Invalid conv2d kernel layout {}".format(kernel_layout) + return kernel_shape[idx] + elif re.match(r"OIHW\d*i\d*o", kernel_layout): + return kernel_shape[0] * kernel_shape[5] + elif re.match(r"OIHW\d*o", kernel_layout): + return kernel_shape[0] * kernel_shape[4] + else: + raise ValueError("Unknown conv2d kernel layout {}".format(kernel_layout)) + +def is_depthwise_conv2d(data_shape, data_layout, kernel_shape, kernel_layout, groups): + ic = get_conv2d_in_channels(data_shape, data_layout) + oc = get_conv2d_out_channels(kernel_shape, kernel_layout) + return ic == oc == groups + +@generic_func +def schedule_injective(attrs, outs, target): + """Schedule injective ops""" + with target: + return topi.generic.schedule_injective(outs) + +@generic_func +def schedule_reduce(attrs, outs, target): + """Schedule reduction ops""" + with target: + return topi.generic.schedule_reduce(outs) + +_op._schedule_injective = schedule_injective +_op._schedule_reduce = schedule_reduce + +# concatenate +@generic_func +def schedule_concatenate(attrs, outs, target): + """Schedule concatenate op""" + with target: + return topi.generic.schedule_injective(outs) + +# pool +@generic_func +def schedule_pool(attrs, outs, target): + """Schedule pooling ops""" + with target: + return topi.generic.schedule_pool(outs, attrs.layout) + +# pool_grad +@generic_func +def schedule_pool_grad(attrs, outs, target): + """Schedule pooling gradient ops""" + with target: + return topi.generic.schedule_pool_grad(outs) + +# adaptive pool +@generic_func +def schedule_adaptive_pool(attrs, outs, target): + """Schedule adaptive pooling ops""" + with target: + return topi.generic.schedule_adaptive_pool(outs) + +# softmax +@generic_func +def schedule_softmax(attrs, outs, target): + """Schedule softmax""" + with target: + return topi.generic.schedule_softmax(outs) + +# lrn +@generic_func +def schedule_lrn(attrs, outs, target): + """Schedule LRN op""" + with target: + return topi.generic.schedule_lrn(outs) + +# l2_normalize +@generic_func +def schedule_l2_normalize(attrs, outs, target): + """Schedule L2 normalize op""" + with target: + return topi.generic.schedule_l2_normalize(outs) + +# bitpack +@generic_func +def schedule_bitpack(attrs, outs, target): + """Schedule bitpack""" + with target: + return topi.generic.schedule_bitpack(outs) + +# conv2d +def wrap_compute_conv2d(topi_compute, need_data_layout=False, need_out_layout=False, + has_groups=False): + """Wrap conv2d topi compute""" + def _compute_conv2d(attrs, inputs, out_type): + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + data_layout = attrs.get_str("data_layout") + out_layout = attrs.get_str("out_layout") + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + args = [inputs[0], inputs[1], strides, padding, dilation] + if has_groups: + args.append(attrs.groups) + if need_data_layout: + args.append(data_layout) + if need_out_layout: + args.append(out_layout) + args.append(out_dtype) + return [topi_compute(*args)] + return _compute_conv2d + +@override_native_generic_func("conv2d_strategy") +def conv2d_strategy(attrs, inputs, out_type, target): + """conv2d generic strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + (dilation_h, dilation_w) = dilation + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc)) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + else: + raise RuntimeError("Unsupported conv2d layout {}".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +# conv2d_NCHWc +@override_native_generic_func("conv2d_NCHWc_strategy") +def conv2d_NCHWc_strategy(attrs, inputs, out_type, target): + """conv2d_NCHWc generic strategy""" + strategy = _op.OpStrategy() + if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8": + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_NCHWc_int8, True, True), + wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8)) + else: + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc)) + return strategy + +# depthwise_conv2d_NCHWc +@override_native_generic_func("depthwise_conv2d_NCHWc_strategy") +def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target): + """depthwise_conv2d generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc)) + return strategy + +# conv2d_winograd_without_weight_transform +@override_native_generic_func("conv2d_winograd_without_weight_transform_strategy") +def conv2d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom generic strategy""" + raise ValueError("No generic implemenation for conv2d_winograd_without_weight_transform") + +# conv2d_winograd_weight_transform +@generic_func +def schedule_conv2d_winograd_weight_transform(attrs, outs, target): + """Schedule conv2d_winograd_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_weight_transform(outs) + +# conv2d_winograd_nnpack_weight_transform +@generic_func +def schedule_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): + """Schedule conv2d_winograd_nnpack_weight_transform""" + with target: + return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) + +# deformable_conv2d +def wrap_compute_deformable_conv2d(topi_compute): + """wrap deformable_conv2d topi compute""" + def _compute_deformable_conv2d(attrs, inputs, out_dtype): + assert attrs.data_layout == "NCHW" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + deformable_groups = attrs.deformable_groups + groups = attrs.groups + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype + out = topi_compute(inputs[0], inputs[1], inputs[2], strides, padding, + dilation, deformable_groups, groups, out_dtype) + return [out] + return _compute_deformable_conv2d + +@override_native_generic_func("deformable_conv2d_strategy") +def deformable_conv2d_strategy(attrs, inputs, out_type, target): + """deformable_conv2d generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw)) + return strategy + +# conv2d_transpose +def wrap_comptue_conv2d_transpose(topi_compute): + """wrap conv2d_transpose topi compute""" + def compute_conv2d_transpose(attrs, inputs, out_dtype): + """Compute definition of conv2d_transpose""" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + out = topi_compute( + inputs[0], inputs[1], strides, padding, out_dtype) + output_padding = get_const_tuple(attrs.output_padding) + out = topi.nn.pad(out, [0, 0, 0, 0], + [0, 0, output_padding[0], output_padding[1]]) + return [out] + return compute_conv2d_transpose + +@override_native_generic_func("conv2d_transpose_strategy") +def conv2d_transpose_strategy(attrs, inputs, out_type, target): + """conv2d_transpose generic strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_comptue_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw)) + return strategy + +# conv3d +def wrap_compute_conv3d(topi_compute): + """wrap conv3d topi compute""" + def _compute_conv3d(attrs, inputs, out_type): + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + + (dilation_d, dilation_h, dilation_w) = dilation + if dilation_d < 1 or dilation_h < 1 or dilation_w < 1: + raise ValueError("Dilation should be positive value") + + if groups == 1: + out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, + layout, out_dtype) + else: + raise ValueError("Not support arbitrary group number for now") + return [out] + return _compute_conv3d + +@override_native_generic_func("conv3d_strategy") +def conv3d_strategy(attrs, inputs, out_type, target): + """conv3d generic strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCDHW": + strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw)) + elif layout == "NDHWC": + strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ndhwc), + wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc)) + else: + raise ValueError("Not support this layout {} yet".format(layout)) + return strategy + +# conv1d +def wrap_compute_conv1d(topi_compute): + """wrap conv1d topi compute""" + def _compute_conv1d(attrs, inputs, out_type): + """Compute definition of conv1d""" + strides = get_const_tuple(attrs.strides) + padding = get_const_tuple(attrs.padding) + dilation = get_const_tuple(attrs.dilation) + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + return [topi_compute(inputs[0], inputs[1], strides, padding, dilation, + out_dtype)] + return _compute_conv1d + +@override_native_generic_func("conv1d_strategy") +def conv1d_strategy(attrs, inputs, out_type, target): + """conv1d generic strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + strategy = _op.OpStrategy() + if layout == "NCW": + strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_ncw)) + elif layout == "NWC": + strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.generic.schedule_conv1d_nwc)) + else: + raise ValueError("Unsupported conv1d layout {}".format(layout)) + return strategy + +# conv1d_transpose +def wrap_compute_conv1d_transpose(topi_compute): + """wrap conv1d_transpose topi compute""" + def _compute_conv1d_tranpsoe(attrs, inputs, out_type): + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) + out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype) + output_padding = get_const_tuple(attrs.output_padding) + out = topi.nn.pad(out, [0, 0, 0], [0, 0, output_padding[0]]) + return [out] + return _compute_conv1d_tranpsoe + +@override_native_generic_func("conv1d_transpose_strategy") +def conv1d_transpose_strategy(attrs, inputs, out_type, target): + """conv1d_transpose generic strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCW", "conv1d_transpose ncw only supported" + assert dilation == (1,), "conv1d_transpose dilation is not supported" + assert groups == 1, "conv1d_transpose groups == 1 only supported" + strategy.add_implement(wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw)) + return strategy + +# dense +def wrap_compute_dense(topi_compute): + """wrap dense topi compute""" + def _compute_dense(attrs, inputs, out_type): + """Compute definition of dense""" + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + return [topi_compute(inputs[0], inputs[1], None, out_dtype)] + return _compute_dense + +@override_native_generic_func("dense_strategy") +def dense_strategy(attrs, inputs, out_type, target): + """dense generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.generic.schedule_dense)) + return strategy + +# batch_matmul +def wrap_compute_batch_matmul(topi_func): + """wrap batch_matmul topi compute""" + def _compute_batch_matmul(attrs, inputs, out_type): + return [topi_func(inputs[0], inputs[1])] + return _compute_batch_matmul + +@override_native_generic_func("batch_matmul_strategy") +def batch_matmul_strategy(attrs, inputs, out_type, target): + """batch_matmul generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), + wrap_topi_schedule(topi.generic.schedule_batch_matmul)) + return strategy + +# sparse_dense +@generic_func +def schedule_sparse_dense(attrs, outs, target): + """schedule sparse_dense""" + with target: + return topi.generic.schedule_sparse_dense(outs) + +# sparse_transpose +@generic_func +def schedule_sparse_transpose(attrs, outs, target): + """schedule sparse_transpose""" + with target: + return topi.generic.schedule_sparse_transpose(outs) + +# argsort +def wrap_compute_argsort(topi_compute): + """Wrap argsort topi compute""" + def _compute_argsort(attrs, inputs, _): + axis = get_const_int(attrs.axis) + is_ascend = bool(get_const_int(attrs.is_ascend)) + dtype = attrs.dtype + return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)] + return _compute_argsort + +@override_native_generic_func("argsort_strategy") +def argsort_strategy(attrs, inputs, out_type, target): + """argsort generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_argsort(topi.argsort), + wrap_topi_schedule(topi.generic.schedule_argsort)) + return strategy + +# topk +def wrap_compute_topk(topi_func): + """Wrap topk compute""" + def _compute_topk(attrs, inputs, out_type): + k = get_const_int(attrs.k) + axis = get_const_int(attrs.axis) + ret_type = attrs.ret_type + is_ascend = bool(get_const_int(attrs.is_ascend)) + dtype = attrs.dtype + out = topi_func(inputs[0], k, axis, ret_type, is_ascend, dtype) + out = out if isinstance(out, list) else [out] + return out + return _compute_topk + +@override_native_generic_func("topk_strategy") +def topk_strategy(attrs, inputs, out_type, target): + """topk generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_topk(topi.topk), + wrap_topi_schedule(topi.generic.schedule_topk)) + return strategy + +# multibox_prior +@generic_func +def schedule_multibox_prior(attrs, outs, target): + """schedule multibox_prior""" + with target: + return topi.generic.schedule_multibox_prior(outs) + +# multibox_transform_loc +@generic_func +def schedule_multibox_transform_loc(attrs, outs, target): + """schedule multibox_transform_loc""" + with target: + return topi.generic.schedule_multibox_transform_loc(outs) + +# get_valid_counts +def wrap_compute_get_valid_counts(topi_compute): + """wrap get_valid_counts topi compute""" + def _compute_get_valid_counts(attrs, inputs, out_type): + score_threshold = get_const_float(attrs.score_threshold) + id_index = get_const_int(attrs.id_index) + score_index = get_const_int(attrs.score_index) + return topi_compute(inputs[0], score_threshold, id_index, score_index) + return _compute_get_valid_counts + +@override_native_generic_func("get_valid_counts_strategy") +def get_valid_counts_strategy(attrs, inputs, out_type, target): + """get_valid_counts generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_get_valid_counts(topi.vision.get_valid_counts), + wrap_topi_schedule(topi.generic.schedule_get_valid_counts)) + return strategy + +# non-maximum suppression +def wrap_compute_nms(topi_compute): + """wrap nms topi compute""" + def _compute_nms(attrs, inputs, out_type): + return_indices = bool(get_const_int(attrs.return_indices)) + max_output_size = get_const_int(attrs.max_output_size) + iou_threshold = get_const_float(attrs.iou_threshold) + force_suppress = bool(get_const_int(attrs.force_suppress)) + top_k = get_const_int(attrs.top_k) + coord_start = get_const_int(attrs.coord_start) + score_index = get_const_int(attrs.score_index) + id_index = get_const_int(attrs.id_index) + invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) + return [topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, + id_index, return_indices, invalid_to_bottom)] + return _compute_nms + +@override_native_generic_func("non_max_suppression_strategy") +def nms_strategy(attrs, inputs, out_type, target): + """nms generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_nms(topi.vision.non_max_suppression), + wrap_topi_schedule(topi.generic.schedule_nms)) + return strategy + +# roi_align +def wrap_compute_roi_align(topi_compute): + """wrap roi_align topi compute""" + def _compute_roi_align(attrs, inputs, out_type): + assert attrs.layout == "NCHW" + pooled_size = get_const_tuple(attrs.pooled_size) + return [topi_compute(inputs[0], inputs[1], + pooled_size=pooled_size, + spatial_scale=attrs.spatial_scale, + sample_ratio=attrs.sample_ratio)] + return _compute_roi_align + +@override_native_generic_func("roi_align_strategy") +def roi_align_strategy(attrs, inputs, out_type, target): + """roi_align generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align)) + return strategy + +# roi_pool +@generic_func +def schedule_roi_pool(attrs, outs, target): + """schedule roi_pool""" + with target: + return topi.generic.schedule_roi_pool(outs) + +# proposal +def wrap_compute_proposal(topi_compute): + """wrap proposal topi compute""" + def _compute_proposal(attrs, inputs, out_type): + scales = get_float_tuple(attrs.scales) + ratios = get_float_tuple(attrs.ratios) + feature_stride = attrs.feature_stride + threshold = attrs.threshold + rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n + rpn_post_nms_top_n = attrs.rpn_post_nms_top_n + rpn_min_size = attrs.rpn_min_size + iou_loss = bool(get_const_int(attrs.iou_loss)) + return [topi_compute(inputs[0], inputs[1], inputs[2], scales, ratios, + feature_stride, threshold, rpn_pre_nms_top_n, + rpn_post_nms_top_n, rpn_min_size, iou_loss)] + return _compute_proposal + +@override_native_generic_func("proposal_strategy") +def proposal_strategy(attrs, inputs, out_type, target): + """proposal generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_proposal(topi.vision.rcnn.proposal), + wrap_topi_schedule(topi.generic.schedule_proposal)) + return strategy + +# argwhere +@generic_func +def schedule_argwhere(attrs, outs, target): + """schedule argwhere""" + with target: + return topi.generic.schedule_argwhere(outs) + +# bitserial_conv2d +def wrap_compute_bitserial_conv2d(topi_compute): + """wrap bitserial_conv2d topi compute""" + def compute_bitserial_conv2d(attrs, inputs, out_dtype): + """Compute definition for bitserial conv2d.""" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + activation_bits = attrs.activation_bits + weight_bits = attrs.weight_bits + pack_dtype = attrs.pack_dtype + out_dtype = attrs.out_dtype + unipolar = attrs.unipolar + return [topi_compute(inputs[0], inputs[1], strides, padding, activation_bits, + weight_bits, pack_dtype, out_dtype, unipolar)] + return compute_bitserial_conv2d + +@override_native_generic_func("bitserial_conv2d_strategy") +def bitserial_conv2d_strategy(attrs, inputs, out_type, target): + """bitserial_conv2d generic strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw)) + elif layout == "NHWC": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc)) + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy + +# bitserial_dense +def wrap_compute_bitserial_dense(topi_compute): + """wrap bitserial_dense topi compute""" + def compute_bitserial_dense(attrs, inputs, out_type): + """Compute definition of bitserial dense""" + data_bits = attrs.data_bits + weight_bits = attrs.weight_bits + pack_dtype = attrs.pack_dtype + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + unipolar = attrs.unipolar + return [topi_compute(inputs[0], inputs[1], data_bits, weight_bits, + pack_dtype, out_dtype, unipolar)] + return compute_bitserial_dense + +@override_native_generic_func("bitserial_dense_strategy") +def bitserial_dense_strategy(attrs, inputs, out_type, target): + """bitserial_dense generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_bitserial_dense(topi.nn.bitserial_dense), + wrap_topi_schedule(topi.generic.schedule_bitserial_dense)) + return strategy diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py new file mode 100644 index 000000000000..0600f875416a --- /dev/null +++ b/python/tvm/relay/op/strategy/hls.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of HLS operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + +@schedule_injective.register("hls") +def schedule_injective_hls(attrs, outs, target): + """schedule injective ops for hls""" + with target: + return topi.hls.schedule_injective(outs) + +@schedule_reduce.register("hls") +def schedule_reduce_hls(attrs, outs, target): + """schedule reduction ops for hls""" + with target: + return topi.hls.schedule_reduce(outs) + +@schedule_concatenate.register("hls") +def schedule_concatenate_hls(attrs, outs, target): + """schedule concatenate for hls""" + with target: + return topi.hls.schedule_injective(outs) + +@schedule_pool.register("hls") +def schedule_pool_hls(attrs, outs, target): + """schedule pooling ops for hls""" + with target: + return topi.hls.schedule_pool(outs, attrs.layout) + +@schedule_adaptive_pool.register("hls") +def schedule_adaptive_pool_hls(attrs, outs, target): + """schedule adaptive pooling ops for hls""" + with target: + return topi.hls.schedule_adaptive_pool(outs) + +@schedule_softmax.register("hls") +def schedule_softmax_hls(attrs, outs, target): + """schedule softmax for hls""" + with target: + return topi.hls.schedule_softmax(outs) + +@override_native_generic_func("conv2d_strategy") +def conv2d_strategy_hls(attrs, inputs, out_type, target): + """conv2d hls strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + (dilation_h, dilation_w) = dilation + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported conv2d layout {}".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + raise RuntimeError("group_conv2d is not supported for hls") + return strategy + +@override_native_generic_func("conv2d_NCHWc_strategy") +def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target): + """conv2d_NCHWc hls strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc)) + return strategy + +@conv2d_transpose_strategy.register("hls") +def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): + """conv2d_transpose hls strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_comptue_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw)) + return strategy + +@dense_strategy.register("hls") +def dense_strategy_hls(attrs, inputs, out_type, target): + """dense hls strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.hls.schedule_dense)) + return strategy + +@bitserial_conv2d_strategy.register("hls") +def bitserial_conv2d_strategy_hls(attrs, inputs, out_type, target): + """bitserial_conv2d hls strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw)) + elif layout == "NHWC": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc)) + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py new file mode 100644 index 000000000000..c94d5cbc211d --- /dev/null +++ b/python/tvm/relay/op/strategy/intel_graphics.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of x86 operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + + +@conv2d_strategy.register("intel_graphics") +def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): + """conv2d intel graphics strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.intel_graphics.conv2d_nchw), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw)) + # conv2d_NCHWc won't work without alter op layout pass + # TODO(@Laurawly): fix this + strategy.add_implement( + wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), + 5) + else: + raise RuntimeError("Unsupported conv2d layout {} for intel graphics". + format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.intel_graphics.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + raise RuntimeError("group_conv2d is not supported for intel graphics") + return strategy + +@conv2d_NCHWc_strategy.register("intel_graphics") +def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target): + """conv2d_NCHWc intel_graphics strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc)) + return strategy diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py new file mode 100644 index 000000000000..8641a959952f --- /dev/null +++ b/python/tvm/relay/op/strategy/mali.py @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of mali operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import + +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + +@conv2d_strategy.register("mali") +def conv2d_strategy_mali(attrs, inputs, out_type, target): + """conv2d mali strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + stride_h, stride_w = attrs.get_int_tuple("strides") + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack)) + + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), + 15) + else: + raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) + else: # group_conv2d + raise RuntimeError("group_conv2d is not supported for mali") + return strategy + +@conv2d_winograd_without_weight_transfrom_strategy.register("mali") +def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target): + """conv2d_winograd_without_weight_transfrom mali strategy""" + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int("groups") + layout = attrs.data_layout + stride_h, stride_w = attrs.get_int_tuple("strides") + assert dilation == (1, 1), "Do not support dilate now" + assert groups == 1, "Do not supoort arbitrary group number" + strategy = _op.OpStrategy() + if layout == "NCHW": + _, _, kh, kw = get_const_tuple(inputs[1].shape) + assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 + strategy.add_implement( + wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd)) + else: + raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". + format(layout)) + return strategy + +@dense_strategy.register(["mali"]) +def dense_strategy_mali(attrs, inputs, out_type, target): + """dense mali strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_dense(topi.mali.dense), + wrap_topi_schedule(topi.mali.schedule_dense)) + return strategy diff --git a/python/tvm/relay/op/strategy/opengl.py b/python/tvm/relay/op/strategy/opengl.py new file mode 100644 index 000000000000..f5da48c150c2 --- /dev/null +++ b/python/tvm/relay/op/strategy/opengl.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of OpenGL operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + +@schedule_injective.register("opengl") +def schedule_injective_opengl(attrs, outs, target): + """schedule injective ops for opengl""" + with target: + return topi.opengl.schedule_injective(outs) + +@schedule_concatenate.register("opengl") +def schedule_concatenate_opengl(attrs, outs, target): + """schedule concatenate for opengl""" + with target: + return topi.opengl.schedule_injective(outs) + +@schedule_pool.register("opengl") +def schedule_pool_opengl(attrs, outs, target): + """schedule pooling ops for opengl""" + with target: + return topi.opengl.schedule_pool(outs, attrs.layout) + +@schedule_adaptive_pool.register("opengl") +def schedule_adaptive_pool_opengl(attrs, outs, target): + """schedule adative pooling ops for opengl""" + with target: + return topi.opengl.schedule_adaptive_pool(outs) + +@schedule_softmax.register("opengl") +def schedule_softmax_opengl(attrs, outs, target): + """schedule softmax for opengl""" + with target: + return topi.opengl.schedule_softmax(outs) + +@conv2d_strategy.register("opengl") +def conv2d_strategy_opengl(attrs, inputs, out_type, target): + """conv2d hls strategy""" + strategy = _op.OpStrategy() + groups = attrs.groups + layout = attrs.data_layout + assert groups == 1, "Don't support group conv2d on OpenGL" + assert layout == "NCHW", "Only support conv2d layout NCHW for OpenGL" + strategy.add_implement(wrap_compute_conv2d(topi.nn.conv2d), + wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw)) + return strategy + +@dense_strategy.register("opengl") +def dense_strategy_opengl(attrs, inputs, out_type, target): + """dense hls strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.opengl.schedule_dense)) + return strategy diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py new file mode 100644 index 000000000000..9e725f65c511 --- /dev/null +++ b/python/tvm/relay/op/strategy/rocm.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of ROCm operator strategy.""" +# pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import +from __future__ import absolute_import + +import topi +from .generic import * +from .. import op as _op + +@schedule_lrn.register("rocm") +def schedule_lrn_rocm(attrs, outs, target): + """schedule LRN for rocm""" + with target: + return topi.rocm.schedule_lrn(outs) + +@schedule_l2_normalize.register("rocm") +def schedule_l2_normalize_rocm(attrs, outs, target): + """schedule L2 normalize for rocm""" + with target: + return topi.rocm.schedule_l2_normalize(outs) + +@conv2d_strategy.register("rocm") +def conv2d_strategy_cuda(attrs, inputs, out_type, target): + """conv2d cuda strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + groups = attrs.groups + layout = attrs.data_layout + stride_h, stride_w = attrs.get_int_tuple("strides") + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw)) + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), + 15) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_hwcn), + wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn)) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_nhwc), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc)) + elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: + assert kernel_layout == "OIHW4o4i" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), + wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8)) + else: + raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) + # add miopen implementation + if "miopen" in target.libs: + if layout == "NCHW": + strategy.add_implement( + wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), + wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), 5) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. + assert kernel_layout == "OIHW" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw)) + elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: + assert kernel_layout == "OIHW4o4i" + strategy.add_implement( + wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8)) + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +@dense_strategy.register(["rocm"]) +def dense_strategy_rocm(attrs, inputs, out_type, target): + """Dense strategy for ROCM""" + strategy = _op.OpStrategy() + assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense" + + strategy.add_implement(wrap_compute_dense(topi.rocm.dense), + wrap_topi_schedule(topi.rocm.schedule_dense)) + if target.target_name == "rocm" and "rocblas" in target.libs: + assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported." + strategy.add_implement( + wrap_compute_dense(topi.rocm.dense_rocblas), + wrap_topi_schedule(topi.rocm.dense_rocblas), 5) + return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py new file mode 100644 index 000000000000..bb6833d203c8 --- /dev/null +++ b/python/tvm/relay/op/strategy/x86.py @@ -0,0 +1,277 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of x86 operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +from __future__ import absolute_import + +import logging + +import topi +from .generic import * +from .. import op as _op +from ....schedule import SpecializedCondition + +logger = logging.getLogger('strategy') + +@schedule_injective.register("cpu") +def schedule_injective_cpu(attrs, outs, target): + """schedule injective ops for x86""" + with target: + return topi.x86.schedule_injective(outs) + +@schedule_reduce.register("cpu") +def schedule_reduce_cpu(attrs, outs, target): + """schedule reduction ops for x86""" + with target: + return topi.x86.schedule_reduce(outs) + +@schedule_concatenate.register("cpu") +def schedule_concatenate_cpu(attrs, outs, target): + """schedule concatenate op for x86""" + with target: + return topi.x86.schedule_concatenate(outs) + +@schedule_pool.register("cpu") +def schedule_pool_cpu(attrs, outs, target): + """schedule pooling ops for x86""" + with target: + return topi.x86.schedule_pool(outs, attrs.layout) + +@schedule_adaptive_pool.register("cpu") +def schedule_adaptive_pool_cpu(attrs, outs, target): + """schedule adaptive pooling ops for x86""" + with target: + return topi.x86.schedule_adaptive_pool(outs) + +@schedule_softmax.register("cpu") +def schedule_softmax_cpu(attrs, outs, target): + """schedule softmax for x86""" + with target: + return topi.x86.schedule_softmax(outs) + +@conv2d_strategy.register("cpu") +def conv2d_strategy_cpu(attrs, inputs, out_type, target): + """conv2d x86 strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): + strategy.add_implement( + wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8)) + else: + strategy.add_implement( + wrap_compute_conv2d(topi.x86.conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc)) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + else: + raise RuntimeError("Unsupported conv2d layout {} for cpu".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + channel_multiplier = get_const_tuple(inputs[1].shape)[1] + if channel_multiplier == 1: + strategy.add_implement( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw)) + else: + logger.warning("For x86 target, depthwise_conv2d with channel " + "multiplier greater than 1 is not optimized") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw)) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + logger.warning("For x86 target, NCHW layout is recommended for depthwise_conv2d.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + logger.warning("group_conv2d is not optimized for cpu.") + strategy.add_implement( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +@conv2d_NCHWc_strategy.register("cpu") +def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): + """conv2d_NCHWc x86 strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): + strategy.add_implement( + wrap_compute_conv2d(topi.x86.conv2d_NCHWc_int8, True, True), + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8)) + else: + strategy.add_implement( + wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc)) + return strategy + +@depthwise_conv2d_NCHWc_strategy.register("cpu") +def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): + """depthwise_conv2d x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc)) + return strategy + +@conv2d_transpose_strategy.register("cpu") +def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): + """conv2d_transpose x86 strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_comptue_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw)) + return strategy + +@conv3d_strategy.register("cpu") +def conv3d_strategy_cpu(attrs, inputs, out_type, target): + """conv3d generic strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCDHW": + logger.warning("conv3d with layout NCDHW is not optimized for cpu.") + strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw)) + elif layout == "NDHWC": + strategy.add_implement(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc)) + else: + raise ValueError("Not support this layout {} yet".format(layout)) + return strategy + +@conv1d_strategy.register("cpu") +def conv1d_strategy_cpu(attrs, inputs, out_type, target): + """conv1d x86 strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + strategy = _op.OpStrategy() + if layout == "NCW": + strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.x86.schedule_conv1d_ncw)) + elif layout == "NWC": + strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc)) + else: + raise ValueError("Unsupported conv1d layout {}".format(layout)) + return strategy + +@dense_strategy.register("cpu") +def dense_strategy_cpu(attrs, inputs, out_type, target): + """dense x86 strategy""" + strategy = _op.OpStrategy() + _, k = inputs[0].shape + strategy.add_implement(wrap_compute_dense(topi.x86.dense_nopack), + wrap_topi_schedule(topi.x86.schedule_dense_nopack), + 10) + if "cblas" in target.libs: + strategy.add_implement(wrap_compute_dense(topi.x86.dense_cblas), + wrap_topi_schedule(topi.x86.schedule_dense_cblas), + 5) + with SpecializedCondition(k > 16): + strategy.add_implement(wrap_compute_dense(topi.x86.dense_pack), + wrap_topi_schedule(topi.x86.schedule_dense_pack)) + return strategy + +@batch_matmul_strategy.register("cpu") +def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): + """batch_matmul x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul), + wrap_topi_schedule(topi.x86.schedule_batch_matmul), + 10) + if "cblas" in target.libs: + strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), + wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), + 5) + return strategy + +@schedule_sparse_dense.register("cpu") +def schedule_sparse_dense_cpu(attrs, outs, target): + """schedule sparse_dense for x86""" + with target: + return topi.x86.schedule_sparse_dense(outs) + +@roi_align_strategy.register("cpu") +def roi_align_strategy_cpu(attrs, inputs, out_type, target): + """roi_align x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_roi_align(topi.x86.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align)) + return strategy + +@bitserial_conv2d_strategy.register("cpu") +def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target): + """bitserial_conv2d x86 strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw)) + elif layout == "NHWC": + strategy.add_implement( + wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc)) + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy + +@bitserial_dense_strategy.register("cpu") +def bitserial_dense_strategy_cpu(attrs, inputs, out_type, target): + """bitserial_dense x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_bitserial_dense(topi.x86.bitserial_dense), + wrap_topi_schedule(topi.x86.schedule_bitserial_dense)) + return strategy diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py index f35283961b27..16468e5eabc7 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -17,65 +17,27 @@ # pylint: disable=invalid-name, unused-argument """Faster R-CNN and Mask R-CNN operations.""" import topi -from topi.util import get_const_tuple, get_float_tuple, get_const_int +from topi.util import get_const_tuple from .. import op as reg +from .. import strategy from ..op import OpPattern - -@reg.register_compute("vision.roi_align") -def compute_roi_align(attrs, inputs, _, target): - """Compute definition of roi_align""" - assert attrs.layout == "NCHW" - return [topi.vision.rcnn.roi_align_nchw( - inputs[0], inputs[1], pooled_size=get_const_tuple(attrs.pooled_size), - spatial_scale=attrs.spatial_scale, sample_ratio=attrs.sample_ratio)] - -@reg.register_schedule("vision.roi_align") -def schedule_roi_align(_, outs, target): - """Schedule definition of roi_align""" - with target: - return topi.generic.vision.schedule_roi_align(outs) - +# roi_align +reg.register_strategy("vision.roi_align", strategy.roi_align_strategy) reg.register_pattern("vision.roi_align", OpPattern.OUT_ELEMWISE_FUSABLE) +# roi_pool @reg.register_compute("vision.roi_pool") -def compute_roi_pool(attrs, inputs, _, target): +def compute_roi_pool(attrs, inputs, _): """Compute definition of roi_pool""" assert attrs.layout == "NCHW" return [topi.vision.rcnn.roi_pool_nchw( inputs[0], inputs[1], pooled_size=get_const_tuple(attrs.pooled_size), spatial_scale=attrs.spatial_scale)] -@reg.register_schedule("vision.roi_pool") -def schedule_roi_pool(_, outs, target): - """Schedule definition of roi_pool""" - with target: - return topi.generic.vision.schedule_roi_pool(outs) - +reg.register_schedule("vision.roi_pool", strategy.schedule_roi_pool) reg.register_pattern("vision.roi_pool", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("vision.proposal") -def compute_proposal(attrs, inputs, _, target): - """Compute definition of proposal""" - scales = get_float_tuple(attrs.scales) - ratios = get_float_tuple(attrs.ratios) - feature_stride = attrs.feature_stride - threshold = attrs.threshold - rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n - rpn_post_nms_top_n = attrs.rpn_post_nms_top_n - rpn_min_size = attrs.rpn_min_size - iou_loss = bool(get_const_int(attrs.iou_loss)) - with target: - return [ - topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales, ratios, - feature_stride, threshold, rpn_pre_nms_top_n, - rpn_post_nms_top_n, rpn_min_size, iou_loss) - ] - -@reg.register_schedule("vision.proposal") -def schedule_proposal(_, outs, target): - """Schedule definition of proposal""" - with target: - return topi.generic.schedule_proposal(outs) - +# proposal +reg.register_strategy("vision.proposal", strategy.proposal_strategy) reg.register_pattern("vision.proposal", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 7de118071aa4..737954da82ba 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -21,43 +21,28 @@ import topi from topi.util import get_const_int, get_const_float, get_float_tuple from .. import op as reg +from .. import strategy from ..op import OpPattern - -@reg.register_schedule("vision.multibox_prior") -def schedule_multibox_prior(_, outs, target): - """Schedule definition of multibox_prior""" - with target: - return topi.generic.schedule_multibox_prior(outs) - - +# multibox_prior @reg.register_compute("vision.multibox_prior") -def compute_multibox_prior(attrs, inputs, _, target): +def compute_multibox_prior(attrs, inputs, _): """Compute definition of multibox_prior""" sizes = get_float_tuple(attrs.sizes) ratios = get_float_tuple(attrs.ratios) steps = get_float_tuple(attrs.steps) offsets = get_float_tuple(attrs.offsets) clip = bool(get_const_int(attrs.clip)) - return [ - topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps, - offsets, clip) - ] - + return [topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps, + offsets, clip)] +reg.register_schedule("vision.multibox_prior", strategy.schedule_multibox_prior) reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE) # multibox_transform_loc -@reg.register_schedule("vision.multibox_transform_loc") -def schedule_multibox_transform_loc(_, outs, target): - """Schedule definition of multibox_detection""" - with target: - return topi.generic.schedule_multibox_transform_loc(outs) - - @reg.register_compute("vision.multibox_transform_loc") -def compute_multibox_transform_loc(attrs, inputs, _, target): +def compute_multibox_transform_loc(attrs, inputs, _): """Compute definition of multibox_detection""" clip = bool(get_const_int(attrs.clip)) threshold = get_const_float(attrs.threshold) @@ -65,57 +50,15 @@ def compute_multibox_transform_loc(attrs, inputs, _, target): return topi.vision.ssd.multibox_transform_loc( inputs[0], inputs[1], inputs[2], clip, threshold, variances) - +reg.register_schedule("vision.multibox_transform_loc", strategy.schedule_multibox_transform_loc) reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE) -reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE) # Get counts of valid boxes -@reg.register_schedule("vision.get_valid_counts") -def schedule_get_valid_counts(_, outs, target): - """Schedule definition of get_valid_counts""" - with target: - return topi.generic.schedule_get_valid_counts(outs) - - -@reg.register_compute("vision.get_valid_counts") -def compute_get_valid_counts(attrs, inputs, _, target): - """Compute definition of get_valid_counts""" - score_threshold = get_const_float(attrs.score_threshold) - id_index = get_const_int(attrs.id_index) - score_index = get_const_int(attrs.score_index) - return topi.vision.get_valid_counts(inputs[0], score_threshold, - id_index, score_index) - +reg.register_strategy("vision.get_valid_counts", strategy.get_valid_counts_strategy) reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) # non-maximum suppression -@reg.register_schedule("vision.non_max_suppression") -def schedule_nms(_, outs, target): - """Schedule definition of nms""" - with target: - return topi.generic.schedule_nms(outs) - - -@reg.register_compute("vision.non_max_suppression") -def compute_nms(attrs, inputs, _, target): - """Compute definition of nms""" - return_indices = bool(get_const_int(attrs.return_indices)) - max_output_size = get_const_int(attrs.max_output_size) - iou_threshold = get_const_float(attrs.iou_threshold) - force_suppress = bool(get_const_int(attrs.force_suppress)) - top_k = get_const_int(attrs.top_k) - coord_start = get_const_int(attrs.coord_start) - score_index = get_const_int(attrs.score_index) - id_index = get_const_int(attrs.id_index) - invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) - return [ - topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, - iou_threshold, force_suppress, top_k, - coord_start, score_index, id_index, - return_indices, invalid_to_bottom) - ] - - +reg.register_strategy("vision.non_max_suppression", strategy.nms_strategy) reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_yolo.py b/python/tvm/relay/op/vision/_yolo.py index 32fc62d5c23a..d6ac0d4bfbcf 100644 --- a/python/tvm/relay/op/vision/_yolo.py +++ b/python/tvm/relay/op/vision/_yolo.py @@ -17,9 +17,9 @@ #pylint: disable=invalid-name, unused-argument """Backend compiler related feature registration""" from __future__ import absolute_import -from ..op import register_schedule, register_pattern -from ..op import schedule_injective, OpPattern +from ..op import register_pattern, OpPattern +from ..op import register_strategy_injective # reorg register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE) -register_schedule("vision.yolo_reorg", schedule_injective) +register_strategy_injective("vision.yolo_reorg") diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index ba100d8d03e4..82b243a9fc14 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -52,11 +52,10 @@ def simulated_quantize_compute(attrs, inputs, out_type, target): return [rdata] -_reg.register_schedule("relay.op.annotation.simulated_quantize", - _reg.schedule_injective) +_reg.register_strategy_injective("relay.op.annotation.simulated_quantize") _reg.register_pattern("relay.op.annotation.simulated_quantize", _reg.OpPattern.ELEMWISE) -_reg.register_schedule("annotation.cast_hint", _reg.schedule_injective) +_reg.register_strategy_injective("annotation.cast_hint") @register_relay_node diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py index d160f78d7c89..affb284da468 100644 --- a/python/tvm/te/schedule.py +++ b/python/tvm/te/schedule.py @@ -517,4 +517,38 @@ def opengl(self): _ffi_api.StageOpenGL(self) +@tvm._ffi.register_object +class SpecializedCondition(Object): + """Specialized condition to enable op specialization.""" + def __init__(self, conditions): + """Create a specialized condition. + + .. note:: + Conditions are represented in conjunctive joint form (CNF). + Each condition should be a simple expression, e.g., n > 16, + m % 8 == 0, etc., where n, m are tvm.Var that represents a + dimension in the tensor shape. + + Parameters + ---------- + conditions : List of tvm.Expr + List of conditions in conjunctive joint form (CNF). + """ + if not isinstance(conditions, (list, _container.Array)): + conditions = [conditions] + self.__init_handle_by_constructor__( + _ffi_api._CreateSpecializedCondition, conditions) + + def __enter__(self): + _ffi_api._EnterSpecializationScope(self) + return self + + def __exit__(self, ptype, value, trace): + _ffi_api._ExitSpecializationScope(self) + + +def current_specialization(): + return _ffi_api._GetCurrentSpecialization() + + tvm._ffi._init_api("schedule", __name__) diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index ca11ffc76ea5..a6af6be73576 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -964,3 +964,11 @@ class Let(PrimExprWithOp): def __init__(self, var, value, body): self.__init_handle_by_constructor__( _ffi_api.Let, var, value, body) + + +@register_object +class Any(PrimExpr): + """Any node. + """ + def __init__(self): + self.__init_handle_by_constructor__(_ffi_api.Any) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index e5629e8f3505..bd51fdf1d59e 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -52,6 +52,24 @@ TVM_REGISTER_NODE_TYPE(CCacheKeyNode); TVM_REGISTER_NODE_TYPE(CCacheValueNode); TVM_REGISTER_OBJECT_TYPE(CompileEngineNode); +CachedFunc CachedFuncNode::make(tvm::Target target, + std::string func_name, + tvm::Array inputs, + tvm::Array outputs, + te::Schedule schedule, + tvm::Array funcs, + tvm::Array shape_func_param_states) { + auto n = make_object(); + n->target = std::move(target); + n->func_name = func_name; + n->inputs = std::move(inputs); + n->outputs = std::move(outputs); + n->schedule = std::move(schedule); + n->funcs = std::move(funcs); + n->shape_func_param_states = std::move(shape_func_param_states); + return CachedFunc(n); +} + CCacheKey CCacheKeyNode::make(Function source_func, Target target) { auto n = make_object(); n->source_func = std::move(source_func); @@ -100,6 +118,7 @@ Array GetShape(const Array& shape) { return res; } +/* // The getter to get schedule from compile engine. // Get schedule from functor. class ScheduleGetter : @@ -208,7 +227,7 @@ class ScheduleGetter : LOG(FATAL) << "not handled"; return tvm::PrimExpr(); } - }, "compile_engine_const", topi::kBroadcast); + }, "compile_engine_const", topi::kBroadcast); scalars_.push_back(value->op); return {value}; } @@ -231,7 +250,7 @@ class ScheduleGetter : } if (count_tuple) { CHECK_EQ(call_node->args.size(), 1U) - << "Only allow function with a single tuple input"; + << "Only allow function with a single tuple input"; } // Prepare the call_node->checked_type(). For the call node inputs, we ensure that the shape is @@ -253,7 +272,7 @@ class ScheduleGetter : } CHECK(call_node->op.as()) - << "Primitive function only allows call into primitive ops"; + << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); Array outputs; // Skip fcompute for device copy operators as it is not registered. @@ -269,8 +288,8 @@ class ScheduleGetter : int op_pattern = fpattern[op]; if (op_pattern >= kCommReduce) { CHECK(!master_op_.defined() || master_op_pattern_ < kCommReduce) - << "Two complicated op in a primitive function " - << " master=" << master_op_ << " current=" << op; + << "Two complicated op in a primitive function " + << " master=" << master_op_ << " current=" << op; } if (op_pattern >= master_op_pattern_) { master_op_ = op; @@ -339,6 +358,7 @@ class ScheduleGetter : // overhead for each invocation of call node when retrieving schedules. const Op& device_copy_op_; }; +*/ // Creates shape function from functor. class MakeShapeFunc : public ExprFunctor(const Expr&)> { @@ -677,9 +697,14 @@ class CompileEngineImpl : public CompileEngineNode { * \return Pair of schedule and cache. * The funcs field in cache is not yet populated. */ - std::pair CreateSchedule( - const Function& source_func, const Target& target) { - return ScheduleGetter(target).Create(source_func); + CachedFunc CreateSchedule(const Function& source_func, const Target& target) { + CachedFunc cfunc; + if (const auto* f = runtime::Registry::Get("relay.backend.create_schedule")) { + cfunc = (*f)(source_func, target); + } else { + LOG(FATAL) << "relay.backend.create_schedule is not registered"; + } + return cfunc; } private: @@ -713,9 +738,9 @@ class CompileEngineImpl : public CompileEngineNode { With target_scope(key->target); CHECK(!value->cached_func.defined()); - auto spair = CreateSchedule(key->source_func, key->target); + auto cfunc = CreateSchedule(key->source_func, key->target); auto cache_node = make_object( - *(spair.second.operator->())); + *(cfunc.operator->())); // Skip lowering for device copy node. const Expr body = (key->source_func)->body; @@ -735,11 +760,12 @@ class CompileEngineImpl : public CompileEngineNode { // lower the function if (const auto* f = runtime::Registry::Get("relay.backend.lower")) { cache_node->funcs = (*f)( - spair.first, all_args, cache_node->func_name, key->source_func); + cfunc->schedule, all_args, cache_node->func_name, key->source_func); } else { tvm::BuildConfig bcfg = BuildConfig::Create(); std::unordered_map binds; - cache_node->funcs = tvm::lower(spair.first, all_args, cache_node->func_name, binds, bcfg); + cache_node->funcs = tvm::lower(cfunc->schedule, all_args, cache_node->func_name, + binds, bcfg); } value->cached_func = CachedFunc(cache_node); return value; @@ -820,6 +846,9 @@ const CompileEngine& CompileEngine::Global() { return *inst; } +TVM_REGISTER_GLOBAL("relay.backend._make_CachedFunc") +.set_body_typed(CachedFuncNode::make); + TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") .set_body_typed(CCacheKeyNode::make); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 15ec2d6bd0f1..a405b208ddcb 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -44,6 +44,7 @@ enum ShapeFuncParamState { kNeedBoth = 3, }; +class CachedFunc; /*! \brief Node container to represent a cached function. */ struct CachedFuncNode : public Object { /* \brief compiled target */ @@ -54,6 +55,8 @@ struct CachedFuncNode : public Object { tvm::Array inputs; /* \brief The outputs to the function */ tvm::Array outputs; + /* \brief The schedule to the function */ + te::Schedule schedule; /*! \brief The lowered functions to support the function. */ tvm::Array funcs; /*! \brief Parameter usage states in the shape function. */ @@ -64,10 +67,19 @@ struct CachedFuncNode : public Object { v->Visit("func_name", &func_name); v->Visit("inputs", &inputs); v->Visit("outputs", &outputs); + v->Visit("schedule", &schedule); v->Visit("funcs", &funcs); v->Visit("shape_func_param_states", &shape_func_param_states); } + TVM_DLL static CachedFunc make(tvm::Target target, + std::string func_name, + tvm::Array inputs, + tvm::Array outputs, + te::Schedule schedule, + tvm::Array funcs, + tvm::Array shape_func_param_states); + static constexpr const char* _type_key = "relay.CachedFunc"; TVM_DECLARE_FINAL_OBJECT_INFO(CachedFuncNode, Object); }; diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 0292a6c2bb05..f63fc7a26c20 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -354,6 +354,12 @@ TVM_REGISTER_GLOBAL("relay._expr.TempExprRealize") return temp->Realize(); }); +TVM_REGISTER_GLOBAL("relay._expr.FunctionGetAttr") +.set_body_typed( + [](Function func, std::string name) { + return FunctionGetAttr(func, name); +}); + TVM_REGISTER_GLOBAL("relay._expr.FunctionSetAttr") .set_body_typed( [](Function func, std::string name, ObjectRef ref) { diff --git a/src/relay/ir/op_attr_types.cc b/src/relay/ir/op_attr_types.cc new file mode 100644 index 000000000000..38f890ba75d4 --- /dev/null +++ b/src/relay/ir/op_attr_types.cc @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(OpImplementNode); +TVM_REGISTER_NODE_TYPE(OpSpecializationNode); +TVM_REGISTER_NODE_TYPE(OpStrategyNode); + +Array OpImplement::Compute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { + return (*this)->fcompute(attrs, inputs, out_type); +} + +te::Schedule OpImplement::Schedule(const Attrs& attrs, + const Array &outs, + const Target& target) { + return (*this)->fschedule(attrs, outs, target); +} + +void OpSpecialization::AddImplement(tvm::relay::FTVMCompute fcompute, + tvm::relay::FTVMSchedule fschedule, + int plevel) { + auto n = make_object(); + n->fcompute = fcompute; + n->fschedule = fschedule; + n->plevel = IntImm(DataType::Int(32), plevel); + (*this)->implements.push_back(OpImplement(n)); +} + +void OpStrategy::AddImplement(FTVMCompute fcompute, + FTVMSchedule fschedule, + int plevel) { + auto curr_cond = te::SpecializedCondition::Current(); + auto specializations = (*this)->specializations; + OpSpecialization op_spec; + for (auto e : specializations) { + if (e->condition == curr_cond) { + op_spec = e; + break; + } + } + if (op_spec.defined()) { + op_spec.AddImplement(fcompute, fschedule, plevel); + } else { + ObjectPtr n = make_object(); + n->condition = curr_cond; + op_spec = OpSpecialization(n); + op_spec.AddImplement(fcompute, fschedule, plevel); + (*this)->specializations.push_back(op_spec); + } +} + +TVM_REGISTER_GLOBAL("relay.op._OpImplementCompute") +.set_body([](TVMArgs args, TVMRetValue* rv) { + OpImplement imp = args[0]; + Attrs attrs = args[1]; + Array inputs = args[2]; + Type out_type = args[3]; + *rv = imp.Compute(attrs, inputs, out_type); +}); + +TVM_REGISTER_GLOBAL("relay.op._OpImplementSchedule") +.set_body([](TVMArgs args, TVMRetValue* rv) { + OpImplement imp = args[0]; + Attrs attrs = args[1]; + Array outs = args[2]; + Target target = args[3]; + *rv = imp.Schedule(attrs, outs, target); +}); + +TVM_REGISTER_GLOBAL("relay.op._make.OpStrategy") +.set_body([](TVMArgs args, TVMRetValue* rv) { + ObjectPtr n = make_object(); + *rv = OpStrategy(n); +}); + +TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplement") +.set_body([](TVMArgs args, TVMRetValue* rv) { + OpStrategy strategy = args[0]; + FTVMCompute compute = args[1]; + FTVMSchedule schedule = args[2]; + int plevel = args[3]; + strategy.AddImplement(compute, schedule, plevel); +}); + + +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 6106b07f543b..36f592355a2c 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -79,7 +79,7 @@ TVM_ADD_FILELINE) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -105,7 +105,7 @@ TVM_ADD_FILELINE) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -123,7 +123,7 @@ Mark the start of bitpacking. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -140,7 +140,7 @@ Mark the end of bitpacking. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -163,7 +163,7 @@ Mark a checkpoint for checkpointing memory optimization. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { Array outputs; for (size_t i = 0; i < inputs.size(); ++i) { outputs.push_back(topi::identity(inputs[i])); @@ -184,7 +184,7 @@ Beginning of a region that is handled by a given compiler. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -209,7 +209,7 @@ End of a region that is handled by a given compiler. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc index 14c0a01576d5..a0f7fbf4cfeb 100644 --- a/src/relay/op/debug.cc +++ b/src/relay/op/debug.cc @@ -36,9 +36,8 @@ namespace relay { TVM_REGISTER_NODE_TYPE(DebugAttrs); Array DebugCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return Array{ topi::identity(inputs[0]) }; } diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 076e3fcb0dbb..d15099b6b451 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -83,7 +83,7 @@ RELAY_REGISTER_OP("memory.alloc_storage") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -179,7 +179,7 @@ RELAY_REGISTER_OP("memory.alloc_tensor") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -228,7 +228,7 @@ RELAY_REGISTER_OP("memory.invoke_tvm_op") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -252,7 +252,7 @@ RELAY_REGISTER_OP("memory.kill") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -340,7 +340,7 @@ RELAY_REGISTER_OP("memory.shape_func") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 6977ac9b8575..cd9b5ddc7fbf 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -735,58 +735,6 @@ weight transformation in advance. .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); -// Positional relay function to create conv2d winograd nnpack operator -// used by frontend FFI. -Expr MakeConv2DWinogradNNPACK(Expr data, - Expr weight, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_without_weight_transform"); - return CallNode::make(op, {data, weight}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_nnpack_without_weight_transform") -.set_body_typed(MakeConv2DWinogradNNPACK); - -RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -.describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout. - This operator assumes the weight tensor is already pre-transformed by - nn.contrib_conv2d_winograd_nnpack_weight_transform. - -- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) -- **weight**: Any shape - We do not check the shape for this input tensor. Since different backend - has different layout strategy. - -- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) -)code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DWinogradNNPACKRel", Conv2DWinogradRel) -.set_attr("FInferCorrectLayout", ConvInferCorrectLayout); - // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); @@ -848,55 +796,6 @@ weight transformation in advance. .set_support_level(10) .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); -// Positional relay function to create conv2d NCHWc operator -// used by frontend FFI. -Expr MakeConv2DNCHWcInt8(Expr data, - Expr kernel, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_NCHWc_int8"); - return CallNode::make(op, {data, kernel}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc_int8") -.set_body_typed(MakeConv2DNCHWcInt8); - - -RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc_int8") -.describe(R"code(Compute conv2d with NCHWc data layout with int8 inputs. -- **data**: Input is 5D packed tensor. -- **weight**: 7D packed tensor. - -- **out**: Output is 5D packed tensor -)code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DNCHWcInt8", Conv2DWinogradRel) -.set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); - // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. Expr MakeConv2DNCHWc(Expr data, diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index ee4471a85c17..10fd4d975ce4 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -93,8 +93,9 @@ RELAY_REGISTER_OP("nn.bias_add") .add_argument("bias", "1D Tensor", "Bias.") .set_support_level(1) .add_type_rel("BiasAdd", BiasAddRel) -.set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { +.set_attr("FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); return tvm::Array{topi::nn::bias_add(inputs[0], inputs[1], param->axis)}; }); @@ -234,8 +235,7 @@ RELAY_REGISTER_OP("nn.leaky_relu") .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); return Array{ topi::leaky_relu(inputs[0], param->alpha) }; }); @@ -315,8 +315,7 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); return Array{ topi::prelu(inputs[0], inputs[1], param->axis)}; }); @@ -351,8 +350,7 @@ RELAY_REGISTER_OP("nn.softmax") .add_type_rel("Identity", IdentityRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ topi::nn::softmax(inputs[0], param->axis) }; @@ -385,8 +383,7 @@ RELAY_REGISTER_OP("nn.log_softmax") .add_type_rel("Identity", IdentityRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); CHECK(param->axis == -1 || param->axis == static_cast(inputs[0].ndim()) - 1) @@ -462,8 +459,7 @@ Example:: .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { return Array{ topi::nn::flatten(inputs[0]) }; }); @@ -489,8 +485,7 @@ RELAY_REGISTER_OP("nn.relu") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { return Array{ topi::relu(inputs[0], 0.0f) }; }); diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index 94602ec9a61a..84a49403e837 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -161,9 +161,8 @@ bool PadRel(const Array& types, } Array PadCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 6775b09e8aa9..e9057b7ac086 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -164,9 +164,8 @@ bool Pool2DRel(const Array& types, template Array Pool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -331,9 +330,8 @@ bool GlobalPool2DRel(const Array& types, template Array GlobalPool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -465,9 +463,8 @@ bool AdaptivePool2DRel(const Array& types, template Array AdaptivePool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -593,8 +590,9 @@ bool Pool2DGradRel(const Array& types, int num_inputs, const Attrs& attrs, } template -Array Pool2DGradCompute(const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { +Array Pool2DGradCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -793,9 +791,8 @@ bool Pool1DRel(const Array& types, template Array Pool1DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCW("NCW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -985,9 +982,8 @@ bool Pool3DRel(const Array& types, template Array Pool3DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCDHW("NCDHW"); const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc index d1b915cfa142..58221ae66f6e 100644 --- a/src/relay/op/tensor/binary.cc +++ b/src/relay/op/tensor/binary.cc @@ -32,9 +32,8 @@ namespace relay { #define RELAY_BINARY_COMPUTE(FTOPI) \ [] (const Attrs& attrs, \ - const Array& inputs, \ - const Type& out_type, \ - const Target& target) -> Array { \ + const Array& inputs, \ + const Type& out_type) -> Array { \ CHECK_EQ(inputs.size(), 2U); \ return {FTOPI(inputs[0], inputs[1])}; \ } \ diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index acbde0d6e28b..5e0795eaa60b 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -176,7 +176,6 @@ template Array ReduceCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, - const Target& target, F f) { const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -321,10 +320,9 @@ bool ReduceRel(const Array& types, Array ArgMaxCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::argmax); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::argmax); } @@ -341,10 +339,9 @@ values over a given axis. Array ArgMinCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::argmin); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::argmin); } RELAY_REGISTER_REDUCE_OP("argmin") @@ -359,10 +356,9 @@ values over a given axis. .set_attr("TOpPattern", kCommReduce); Array SumCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::sum); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::sum); } @@ -393,10 +389,9 @@ Example:: Array AllCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::all); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::all); } @@ -430,10 +425,9 @@ Example:: Array AnyCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::any); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::any); } @@ -467,10 +461,9 @@ Example:: Array MaxCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::max); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::max); } RELAY_REGISTER_REDUCE_OP("max") @@ -485,10 +478,9 @@ RELAY_REGISTER_REDUCE_OP("max") Array MinCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::min); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::min); } @@ -504,10 +496,9 @@ RELAY_REGISTER_REDUCE_OP("min") Array ProdCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::prod); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::prod); } RELAY_REGISTER_REDUCE_OP("prod") @@ -534,9 +525,8 @@ Example:: Array MeanCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -546,7 +536,7 @@ Array MeanCompute(const Attrs& attrs, param->exclude)) { count *= inputs[0]->shape[i]; } - auto res = ReduceCompute(attrs, inputs, out_type, target, topi::sum); + auto res = ReduceCompute(attrs, inputs, out_type, topi::sum); return {topi::divide(res[0], count)}; } @@ -599,9 +589,8 @@ bool VarianceRel(const Array& types, } Array VarianceCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -615,7 +604,7 @@ Array VarianceCompute(const Attrs& attrs, } std::vector expand_shape; auto sq_diff = topi::power(topi::subtract(data, mean), 2); - auto var = topi::divide(ReduceCompute(attrs, {sq_diff}, out_type, target, topi::sum)[0], count); + auto var = topi::divide(ReduceCompute(attrs, {sq_diff}, out_type, topi::sum)[0], count); return {var}; } diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 969912f4de8b..53bcba7f1356 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -66,9 +66,8 @@ bool CastRel(const Array& types, } Array CastCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const CastAttrs *param = attrs.as(); CHECK(param != nullptr); DataType dtype = param->dtype; @@ -126,9 +125,8 @@ bool CastLikeRel(const Array& types, Array CastLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::cast(inputs[0], inputs[1]->dtype) }; } @@ -156,8 +154,9 @@ RELAY_REGISTER_OP("cast_like") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); -Array ReinterpretCompute(const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { +Array ReinterpretCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { const CastAttrs* param = attrs.as(); CHECK(param != nullptr); DataType dtype = param->dtype; @@ -231,9 +230,8 @@ bool ExpandDimsRel(const Array& types, } Array ExpandDimsCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ExpandDimsAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::expand_dims(inputs[0], param->axis, param->num_newaxis) }; @@ -270,9 +268,8 @@ RELAY_REGISTER_OP("expand_dims") TVM_REGISTER_NODE_TYPE(ConcatenateAttrs); Array ConcatenateCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ConcatenateAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::concatenate(inputs, param->axis) }; @@ -413,9 +410,8 @@ bool StackRel(const Array& types, } Array StackCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const StackAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::stack(inputs, param->axis) }; @@ -505,9 +501,8 @@ bool TransposeRel(const Array& types, } Array TransposeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ topi::transpose(inputs[0], param->axes) }; @@ -688,9 +683,8 @@ bool ReshapeRel(const Array& types, } Array ReshapeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); Array newshape; @@ -923,9 +917,8 @@ bool TakeRel(const Array& types, } Array TakeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); if (!param->axis.defined()) { @@ -1010,9 +1003,8 @@ bool FullRel(const Array& types, } Array FullCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); return { topi::full(out_ttype->shape, out_ttype->dtype, inputs[0]()) }; } @@ -1118,9 +1110,8 @@ bool FullLikeRel(const Array& types, } Array FullLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::full_like(inputs[0], inputs[1]()) }; } @@ -1230,9 +1221,8 @@ inline te::Tensor DynamicArange(const te::Tensor& start, } Array ArangeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ArangeAttrs* param = attrs.as(); te::Tensor start = inputs[0]; te::Tensor stop = inputs[1]; @@ -1325,9 +1315,8 @@ bool RepeatRel(const Array& types, } Array RepeatCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const RepeatAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::repeat(inputs[0], param->repeats, param->axis) }; @@ -1436,9 +1425,8 @@ bool TileRel(const Array& types, } Array TileCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const TileAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::tile(inputs[0], param->reps) }; @@ -1497,9 +1485,8 @@ bool ReverseRel(const Array& types, } Array ReverseCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ReverseAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::flip(inputs[0], param->axis) }; @@ -1571,9 +1558,8 @@ Expr MakeWhere(const Expr& condition, const Expr& x, const Expr& y) { } Array WhereCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::where(inputs[0], inputs[1], inputs[2]) }; } @@ -1688,9 +1674,8 @@ bool SqueezeRel(const Array& types, } Array SqueezeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const SqueezeAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::squeeze(inputs[0], param->axis) }; @@ -1729,9 +1714,8 @@ Expr MakeCollapseSumLike(Expr data, } Array CollapseSumLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); return { topi::collapse_sum(inputs[0], out_ttype->shape) }; @@ -1774,9 +1758,8 @@ Expr MakeBroadCastTo(Expr data, Array shape) { } Array BroadCastToCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { auto ioattrs = attrs.as(); CHECK(ioattrs != nullptr); return { topi::broadcast_to(inputs[0], ioattrs->shape) }; @@ -1812,9 +1795,8 @@ Expr MakeBroadCastToLike(Expr data, } Array BroadCastToLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); return { topi::broadcast_to(inputs[0], out_ttype->shape) }; @@ -2019,9 +2001,8 @@ Expr MakeStridedSlice(Expr data, } Array StridedSliceCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const StridedSliceAttrs *param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2176,9 +2157,8 @@ bool SplitRel(const Array& types, } Array SplitCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto param = attrs.as(); CHECK(param != nullptr); @@ -2305,9 +2285,8 @@ Expr MakeSliceLike(Expr data, } Array SliceLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); Array src_shape = inputs[0]->shape; @@ -2371,9 +2350,8 @@ RELAY_REGISTER_OP("slice_like") TVM_REGISTER_NODE_TYPE(LayoutTransformAttrs); Array LayoutTransformCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2504,9 +2482,8 @@ bool GatherNDRel(const Array& types, } Array GatherNDCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::gather_nd(inputs[0], inputs[1]) }; } @@ -2558,9 +2535,8 @@ bool SequenceMaskRel(const Array& types, } Array SequenceMaskCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2671,9 +2647,8 @@ bool OneHotRel(const Array& types, } Array OneHotCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array { diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 7f6db50bf702..caa6451542c9 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -34,9 +34,8 @@ namespace relay { #define RELAY_UNARY_COMPUTE(FTOPI) \ [] (const Attrs& attrs, \ - const Array& inputs, \ - const Type& out_type, \ - const Target& target) -> Array { \ + const Array& inputs, \ + const Type& out_type) -> Array { \ return {FTOPI(inputs[0])}; \ } \ @@ -302,9 +301,8 @@ bool ShapeOfRel(const Array& types, } Array ShapeOfCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { CHECK_EQ(inputs.size(), 1); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -353,9 +351,8 @@ bool NdarraySizeRel(const Array& types, } Array NdarraySizeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { CHECK_EQ(inputs.size(), 1); const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc index 9c4a2850903b..7d152718f3a0 100644 --- a/src/relay/op/vision/yolo.cc +++ b/src/relay/op/vision/yolo.cc @@ -83,8 +83,7 @@ Its function is mostly shape transform.")doc" TVM_ADD_FILELINE) .add_type_rel("YoloReorg", YoloReorgRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* params = attrs.as(); CHECK(params != nullptr); return Array{ topi::vision::reorg(inputs[0], params->stride) }; diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc index 0cc3ff090dd8..fe8862523dda 100644 --- a/src/relay/pass/alter_op_layout.cc +++ b/src/relay/pass/alter_op_layout.cc @@ -83,7 +83,10 @@ class AlterTransformMemorizer : public TransformMemorizer { auto ttype = expr->type_as(); tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); } - Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos); + // TODO(@kevinthesun, @icemelon9): This won't work if inputs/outputs are dynamic shapes. + // Probably we need to disable the AlterOpLayout when compiling dynamic models. + Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos, + ref_call->checked_type()); if (altered_value.defined()) { new_e = altered_value; modified = true; diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index d3b448d37790..1f3c3a17aa0e 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -20,9 +20,11 @@ /*! * \file schedule_lang.cc */ +#include #include #include #include +#include #include #include "graph.h" @@ -787,6 +789,67 @@ IterVarRelation SingletonNode::make(IterVar iter) { return IterVarRelation(n); } +SpecializedCondition SpecializedConditionNode::make(Array conditions) { + auto n = make_object(); + n->clauses = conditions; + return SpecializedCondition(n); +} + +/*! \brief Entry to hold the SpecializedCondition context stack. */ +struct TVMSpecializationThreadLocalEntry { + /*! \brief The current specialized condition */ + std::stack condition_stack; +}; + +/*! \brief Thread local store to hold the Target context stack. */ +typedef dmlc::ThreadLocalStore TVMSpecializationThreadLocalStore; + +void SpecializedCondition::EnterWithScope() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + entry->condition_stack.push(*this); +} + +void SpecializedCondition::ExitWithScope() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + CHECK(!entry->condition_stack.empty()); + CHECK(entry->condition_stack.top().same_as(*this)); + entry->condition_stack.pop(); +} + +SpecializedCondition SpecializedCondition::Current() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + SpecializedCondition cond; + if (entry->condition_stack.size() > 0) { + cond = entry->condition_stack.top(); + } + return cond; +} + +TVM_REGISTER_GLOBAL("_CreateSpecializedCondition") +.set_body_typed(SpecializedConditionNode::make); + +TVM_REGISTER_GLOBAL("_GetCurrentSpecialization") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = SpecializedCondition::Current(); +}); + +class SpecializedCondition::Internal { + public: + static void EnterScope(SpecializedCondition cond) { + cond.EnterWithScope(); + } + + static void ExitScope(SpecializedCondition cond) { + cond.ExitWithScope(); + } +}; + +TVM_REGISTER_GLOBAL("_EnterSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::EnterScope); + +TVM_REGISTER_GLOBAL("_ExitSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::ExitScope); + TVM_REGISTER_NODE_TYPE(StageNode); TVM_REGISTER_NODE_TYPE(IterVarAttrNode); TVM_REGISTER_NODE_TYPE(SplitNode); @@ -794,6 +857,7 @@ TVM_REGISTER_NODE_TYPE(FuseNode); TVM_REGISTER_NODE_TYPE(RebaseNode); TVM_REGISTER_NODE_TYPE(SingletonNode); TVM_REGISTER_NODE_TYPE(ScheduleNode); +TVM_REGISTER_NODE_TYPE(SpecializedConditionNode); // Printer TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -848,7 +912,13 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); p->stream << "schedule(" << op << ")"; - }); +}) +.set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "specialization("; + p->Print(op->clauses); + p->stream << ')'; +}); TVM_REGISTER_GLOBAL("te.CreateSchedule") @@ -962,5 +1032,6 @@ TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite") TVM_REGISTER_GLOBAL("te.ScheduleRFactor") .set_body_method(&Schedule::rfactor); + } // namespace te } // namespace tvm diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py index 8f550d82c4f6..73dbf106b541 100644 --- a/tests/python/relay/test_autotvm_task_extraction.py +++ b/tests/python/relay/test_autotvm_task_extraction.py @@ -39,25 +39,28 @@ def test_task_extraction(): target = 'llvm' mod_list = [] params_list = [] + conv2d = relay.op.get("nn.conv2d") + conv2d_transpose = relay.op.get("nn.conv2d_transpose") + dense = relay.op.get("nn.dense") mod, params, _ = get_network('resnet-18', batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 12 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 12 mod, params, _ = get_network('resnet-18', batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.dense,)) + ops=(dense,)) assert len(tasks) == 1 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.dense,)) + ops=(dense,)) assert len(tasks) == 1 mod, params, _ = get_network('resnet-18', batch_size=1) @@ -65,11 +68,14 @@ def test_task_extraction(): params_list.append(params) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) assert len(tasks) == 13 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) + assert len(tasks) == 13 + tasks = autotvm.task.extract_from_program(mod, target=target, + params=params) assert len(tasks) == 13 mod, params, _ = get_network('mobilenet', batch_size=1) @@ -77,18 +83,18 @@ def test_task_extraction(): params_list.append(params) tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) assert len(tasks) == 20 mod, params, _ = get_network('dcgan', batch_size=1) tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d_transpose,)) + ops=(conv2d_transpose,)) assert len(tasks) == 4 tasks = autotvm.task.extract_from_multiple_program(mod_list, params_list, target=target, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 31 def test_template_key_provided(): @@ -136,6 +142,7 @@ def test_template_key_default(): if __name__ == '__main__': test_task_extraction() - test_template_key_provided() - test_template_key_empty() - test_template_key_default() + # TODO(@icemelon9): template key will no long exist, remove these tasks. + # test_template_key_provided() + # test_template_key_empty() + # test_template_key_default() diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 0d3fd4b3f829..e9acd96f3935 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -222,7 +222,7 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, continue intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4, atol=1e-4) def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), @@ -240,13 +240,13 @@ def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, mod = tvm.IRModule() mod["main"] = func - test_schedule='{"i": ["llvm -device=arm_cpu", "topi_nn_depthwise_conv2d_nchw", \ + test_schedule='{"i": ["llvm -device=arm_cpu", "depthwise_conv2d_nchw_spatial_pack.arm_cpu", \ [["TENSOR", [1, 512, 32, 32], "float32"], \ ["TENSOR", [512, 1, 3, 3], "float32"], \ [1, 1], [1, 1], [1, 1], "float32"], {}, \ - ["depthwise_conv2d_nchw", [1, 512, 32, 32, "float32"], \ + ["depthwise_conv2d_nchw_spatial_pack.arm_cpu", [1, 512, 32, 32, "float32"], \ [512, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], \ - {"i": 743640, "t": "contrib_spatial_pack", "c": null, \ + {"i": 743640, "t": "", "c": null, \ "e": [["tile_co", "sp", [32, 16]], ["tile_oh", "sp", [8, 1]], \ ["tile_ow", "sp", [1, 8]], \ ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], \ @@ -319,7 +319,6 @@ def _query_inside(self, target, workload): if key in self.memory: return self.memory[key] cfg = autotvm.task.space.FallbackConfigEntity() - cfg.template_key = 'winograd' cfg.is_fallback = False cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) @@ -1113,6 +1112,9 @@ def _has_fast_int8_instructions(asm, target): else: assert False, "Target should be Skylake or Cascadelake" + # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout. + # Re-enable this after adding conv2d_NCHWc_int8 support for NHWC. + # compile conv2d for x86 (skylake, cascadelake) and test assembly contains *pmadd* instructions targets = ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"] llvm_version = tvm.target.codegen.llvm_version_major() @@ -1127,11 +1129,11 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - for ic in [1, 4, 6]: - asm = _compile(ic=ic, oc=16, target=target, data_layout="NHWC", - kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # for ic in [1, 4, 6]: + # asm = _compile(ic=ic, oc=16, target=target, data_layout="NHWC", + # kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Sweep the output channels to check int8 robustness # Output channels should be a multiple of 16 internally. @@ -1141,20 +1143,20 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - for oc in [4, 16, 20]: - asm = _compile(ic=8, oc=oc, target=target, data_layout="NHWC", - kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # for oc in [4, 16, 20]: + # asm = _compile(ic=8, oc=oc, target=target, data_layout="NHWC", + # kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Check that both non-divisible oc and ic work asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW', dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Check that int8 x int8 goes through legalization so that fast instructions can be picked up. for target in targets: @@ -1165,16 +1167,16 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Ensure that code is generated when datatypes are not HW supported. - dtypes = ('uint8', 'uint8', 'int32') - asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - # Check that intrinisic is not present in the assembly. - assert not _has_fast_int8_instructions(asm, target) + # dtypes = ('uint8', 'uint8', 'int32') + # asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # # Check that intrinisic is not present in the assembly. + # assert not _has_fast_int8_instructions(asm, target) # Check that a vectorized instruction is generated for older Intel # generations, because we default to NCHWc layout. @@ -1223,7 +1225,7 @@ def test_bitserial_conv2d_infer_type(): y = relay.nn.bitserial_conv2d( x, w, kernel_size=(3, 3), padding=(0, 0), channels=32) yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType( + assert yy.checked_type == relay.TensorType( (n, 32, 222, 222), "int16") @@ -1233,9 +1235,11 @@ def test_bitpack_infer_type(): x = relay.var("x", relay.ty.TensorType((o, i, h, w), "int16")) y = relay.nn.bitpack(x, bit_axis=4, pack_axis=1, pack_type='uint16', bits=1) yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType( + assert yy.checked_type == relay.TensorType( (32, 2, 128, 128, 1), "uint16") +# TODO(@jwfromm): Need to add bitserial_conv2d & bitpack run test cases + if __name__ == "__main__": test_pool1d() diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index a8b22fd787ee..173a237bf8d9 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -48,7 +48,7 @@ def _create_data(target, dshape, dtype, layout): tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(relay.op.get("nn.conv2d"),)) wkl_list = [ create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), @@ -121,7 +121,8 @@ def test_graph_tuner_layout_transform(): dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout) executor = DPTuner(g, {"data": dshape}, records, target_ops, target=target, log_file=log_file) @@ -156,7 +157,8 @@ def test_DPTuner_run(): dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout) mod = tvm.IRModule() @@ -207,7 +209,8 @@ def test_PBQPTuner_run(): dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout) costs = [0.02, 0.02, 0.045] @@ -255,7 +258,8 @@ def test_many_sub_graphs(): dtype = "float32" dshape = (1, 8, 8, 3) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) t0 = relay.transpose(data, (0, 3, 1, 2)) @@ -277,7 +281,7 @@ def test_many_sub_graphs(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) wkl_list = [ create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), @@ -376,7 +380,8 @@ def test_tuple(): dtype = "float32" dshape = (1, 5, 32, 32) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) w0 = relay.var("w0_weight") @@ -390,7 +395,7 @@ def test_tuple(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) wkl_list = [ create_workload((1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), create_workload((1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), @@ -472,7 +477,8 @@ def test_triangle_block(): dtype = "float32" dshape = (1, 3, 8, 8) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) w0 = relay.var("w0_weight") @@ -488,7 +494,7 @@ def test_triangle_block(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) wkl_list = [ create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py index 397ea235ecbf..885065fee8d0 100644 --- a/tests/python/unittest/test_graph_tuner_utils.py +++ b/tests/python/unittest/test_graph_tuner_utils.py @@ -36,7 +36,7 @@ def create_workload(dshape, kshape, strides, data = tvm.placeholder(dshape, dtype=dtype) kernel = tvm.placeholder(kshape, dtype=dtype) return autotvm.task.args_to_workload([data, kernel, strides, padding, dilation, layout, - out_dtype], conv2d) + out_layout, out_dtype], "conv2d_NCHWc.x86") def verify_has_multiple_inputs(node_list, node_idx, input_names, expected_result): @@ -119,7 +119,7 @@ def test_get_in_nodes(): out = relay.nn.conv2d(out3, w1) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] input_names = ["data"] node_list = [] node_dict = {} diff --git a/topi/include/topi/cuda/normalization.h b/topi/include/topi/cuda/normalization.h index 1b42308d0ac2..bfc209db213b 100644 --- a/topi/include/topi/cuda/normalization.h +++ b/topi/include/topi/cuda/normalization.h @@ -35,13 +35,10 @@ using namespace tvm::te; namespace cuda { /*! * \brief Create a CUDA schedule for LRN -* -* \param target The target to generate a schedule for. * \param outs The output tensors. -* * \return A schedule for the given ops. */ -inline Schedule schedule_lrn(const Target &target, const Array& outs) { +inline Schedule schedule_lrn(const Array& outs) { Array out_ops; for (auto t : outs) { out_ops.push_back(t->op); diff --git a/topi/include/topi/rocm/normalization.h b/topi/include/topi/rocm/normalization.h index 692370d65bb7..303f4a8302c7 100644 --- a/topi/include/topi/rocm/normalization.h +++ b/topi/include/topi/rocm/normalization.h @@ -34,14 +34,11 @@ using namespace tvm::te; namespace rocm { /*! * \brief Create a rocm schedule for LRN -* -* \param target The target to generate a schedule for. * \param outs The output tensors. -* * \return A schedule for the given ops. */ -inline Schedule schedule_lrn(const Target &target, const Array& outs) { - return topi::cuda::schedule_lrn(target, outs); +inline Schedule schedule_lrn(const Array& outs) { + return topi::cuda::schedule_lrn(outs); } } // namespace rocm diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index a0c6ab0c6d2d..f1019e667e81 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -40,6 +40,7 @@ from .broadcast import * from .sort import * from .argwhere import * +from . import generic from . import nn from . import x86 from . import cuda diff --git a/topi/python/topi/argwhere.py b/topi/python/topi/argwhere.py index 32f4e8718c46..c2a9adea0c2a 100644 --- a/topi/python/topi/argwhere.py +++ b/topi/python/topi/argwhere.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks """Argwhere operator""" -import tvm from tvm import hybrid @hybrid.script @@ -164,7 +163,6 @@ def hybrid_argwhere_5d(output_shape, condition): valid_index += 1 return a -@tvm.target.generic_func def argwhere(output_shape, condition): """Find the indices of elements of a tensor that are non-zero. diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index 517941c1905f..63f17422bcf1 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -17,10 +17,11 @@ """Schedule for ARM CPU""" -from . import conv2d -from . import depthwise_conv2d -from . import conv2d_transpose -from . import conv2d_int8 -from . import bitserial_conv2d -from . import bitserial_dense -from . import injective +from .conv2d import * +from .depthwise_conv2d import * +from .conv2d_transpose import * +from .conv2d_int8 import * +from . import conv2d_alter_op +from .bitserial_conv2d import * +from .bitserial_dense import * +from .injective import * diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index 4de2b1438a92..4b80b6b3b7af 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -26,7 +26,6 @@ from ..nn.bitserial_util import bitpack, binary_op_multiplier from ..nn.util import get_pad_tuple from ..util import get_const_int, get_const_tuple -from .. import generic def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): if use_bitpack: @@ -38,9 +37,9 @@ def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): return tvm.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') -@autotvm.register_topi_compute(bitserial_conv2d_nhwc, 'arm_cpu', 'direct') -def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, unipolar): +@autotvm.register_topi_compute("bitserial_conv2d_nhwc.arm_cpu") +def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, unipolar): """ Compute convolution with pack on spatial axes. """ assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" assert pack_dtype == 'uint8', "only support packing into uint8 bits" @@ -323,7 +322,7 @@ def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec, s[last].parallel(oh) return s -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nhwc, 'arm_cpu', 'direct') +@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.arm_cpu") def schedule_bitserial_conv2d_nhwc(cfg, outs): """Arm cpu schedule for bitserial conv2d""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/arm_cpu/bitserial_dense.py b/topi/python/topi/arm_cpu/bitserial_dense.py index 8bd6c5d15f8c..3f1889c8d7ff 100644 --- a/topi/python/topi/arm_cpu/bitserial_dense.py +++ b/topi/python/topi/arm_cpu/bitserial_dense.py @@ -21,15 +21,13 @@ from tvm import autotvm from topi.util import get_const_tuple from .. import tag -from .. import generic from .bitserial_conv2d import _intrin_popcount from ..nn.pad import pad -from ..nn.bitserial_dense import bitserial_dense from ..nn.bitserial_util import bitpack, binary_op_multiplier -@autotvm.register_topi_compute(bitserial_dense, ['arm_cpu'], 'direct') -def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, - unipolar): +@autotvm.register_topi_compute('bitserial_dense.arm_cpu') +def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, + unipolar): """The default implementation of bitserial dense in topi. Parameters @@ -111,7 +109,7 @@ def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtyp return matmul -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['arm_cpu'], 'direct') +@autotvm.register_topi_schedule('bitserial_dense.arm_cpu') def schedule_bitserial_dense(cfg, outs): """Schedule for binary_dense. diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index f0d650adeac1..54672810a19f 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -18,20 +18,12 @@ """Conv2D schedule for ARM CPU""" from __future__ import absolute_import as _abs -import logging - import tvm from tvm import autotvm import tvm.contrib.nnpack -from ..generic import schedule_conv2d_nchw, schedule_conv2d_nhwc, \ - schedule_conv2d_winograd_without_weight_transform, \ - schedule_conv2d_winograd_nnpack_without_weight_transform from ..util import traverse_inline, get_const_tuple -from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \ - conv2d_winograd_without_weight_transform, \ - conv2d_winograd_nnpack_without_weight_transform, \ - depthwise_conv2d_nchw +from .. import nn from ..nn.util import get_const_int, get_pad_tuple from ..nn.winograd_util import winograd_transform_matrices from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \ @@ -39,75 +31,15 @@ schedule_conv2d_spatial_pack_nchw, \ schedule_conv2d_spatial_pack_nhwc -logger = logging.getLogger('topi') - -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct']) -def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """TOPI compute callback for conv2d - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height, - filter_width, num_filter_block] - strides : list of two ints - [stride_height, stride_width] +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=2) - padding : list of two ints - [pad_height, pad_width] - - dilation : list of two ints - [dilation_height, dilation_width] - - layout : str - layout of data - - out_dtype: str - The output type. This is used for mixed precision. - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=2) - elif layout == 'NHWC': - return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, - dilation, out_dtype) - else: - raise ValueError("Unsupported layout {}".format(layout)) - -@autotvm.register_topi_schedule( - schedule_conv2d_nchw, 'arm_cpu', - ['direct', 'winograd', 'winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def schedule_conv2d_nchw_arm_cpu(cfg, outs): - """TOPI schedule callback for conv2d - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.arm_cpu") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -131,35 +63,18 @@ def _callback(op): schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) - if 'winograd_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd(cfg, s, output, outs[0]) - - if 'winograd_nnpack_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd_nnpack(cfg, s, output, outs[0]) - traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(schedule_conv2d_nhwc, 'arm_cpu', ['direct']) -def schedule_conv2d_nhwc_arm_cpu(cfg, outs): - """TOPI schedule callback for conv2d - Parameters - ---------- - cfg: ConfigEntity - The config for this template +@autotvm.register_topi_compute("conv2d_nhwc_spatial_pack.arm_cpu") +def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, + dilation, out_dtype) - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ +@autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.arm_cpu") +def schedule_conv2d_nhwc_spatial_pack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -170,14 +85,27 @@ def _callback(op): return s -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd']) -def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd template """ +@autotvm.register_topi_compute("conv2d_nchw_winograd.arm_cpu") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): tile_size = 4 - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size) -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.arm_cpu") +def schedule_conv2d_nchw_winograd(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + output = op.output(0) + _schedule_winograd(cfg, s, output, outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): @@ -187,7 +115,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -199,9 +127,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") idxd = tvm.indexdiv idxm = tvm.indexmod @@ -272,6 +199,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt cfg.add_flop(2 * N * K * H * W * KH * KW * C) return output + def _schedule_winograd(cfg, s, output, last): Y = output.op.input_tensors[0] M, A = Y.op.input_tensors @@ -356,26 +284,37 @@ def _schedule_winograd(cfg, s, output, last): s[output].compute_inline() -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp16']) -def conv2d_arm_cpu_winograd_nnpack_fp16( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd_nnpack_fp16 template """ - return conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16) +@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack.arm_cpu") +def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, out_dtype): + dtype = data.dtype + if dtype == "float32": + return _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, + tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + elif dtype == "float16": + return _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, + tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16) + else: + raise ValueError("Unsupported data type {} for conv2d winograd nnpack". + format(dtype)) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack.arm_cpu") +def schedule_conv2d_nchw_winograd_nnpack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + def _callback(op): + if 'winograd_nnpack_conv2d_output' in op.tag: + output = op.output(0) + _schedule_winograd_nnpack(cfg, s, output, outs[0]) -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp32']) -def conv2d_arm_cpu_winograd_nnpack_fp32( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd_nnpack_fp32 template """ - return conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + traverse_inline(s, outs[0].op, _callback) + return s -def conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, convolution_algorithm): +def _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, convolution_algorithm): """ TOPI compute callback. Use winograd NNPACK template """ N, CI, IH, IW = get_const_tuple(data.shape) @@ -389,7 +328,6 @@ def conv2d_arm_cpu_winograd_nnpack( HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and pt == 1 and pb == 1 and pl == 1 and pr == 1 and HSTR == 1\ and WSTR == 1 H = (IH + pt + pb - 3) // HSTR + 1 @@ -416,6 +354,7 @@ def conv2d_arm_cpu_winograd_nnpack( cfg.add_flop(2 * N * CI * H * W * KH * KW * CO) return output + def _schedule_winograd_nnpack(cfg, s, output, last): # Could have bias. @@ -429,36 +368,9 @@ def _schedule_winograd_nnpack(cfg, s, output, last): s[TK].pragma(s[TK].op.axis[0], 'debug_skip_region') -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,\ - tile_size) - - -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - 'arm_cpu', ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): - """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if 'winograd_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd(cfg, s, output, outs[0]) - - traverse_inline(s, outs[0].op, _callback) - return s - - -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD NNPACK WITHOUT WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_nnpack_without_weight_transform, - 'arm_cpu', - ['winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, - padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd NNPACK template """ +@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") +def conv2d_nchw_winograd_nnpack_without_weight_transform( + cfg, data, transformed_kernel, bias, strides, padding, dilation, out_dtype): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -471,7 +383,6 @@ def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, KH, KW = 3, 3 pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and pt == 1 and pb == 1 and pl == 1 and pr == 1 and HSTR == 1\ and WSTR == 1 H = (IH + pt + pb - 3) // HSTR + 1 @@ -492,9 +403,8 @@ def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, return output -@autotvm.register_topi_schedule(schedule_conv2d_winograd_nnpack_without_weight_transform, - 'arm_cpu', ['winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def schedule_conv2d_winograd_nnpack_without_weight_transform_(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") +def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs): """TOPI schedule callback""" s = tvm.create_schedule([x.op for x in outs]) @@ -505,226 +415,3 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s - - -##### REGISTER ALTER OP LAYOUT ##### -@conv2d_alter_layout.register(["arm_cpu"]) -def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): - """Alter op layout for pre-computing kernel transformation - - Parameters - ---------- - attrs : tvm.ir.Attrs - Attributes of current convolution - inputs : tvm.relay.Expr - Grouped input symbols - tinfos : list - Input shape and dtype - F: symbol - The context, can be either relay.op - - Note - ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. - """ - copy_inputs = list(inputs) - new_attrs = {k: attrs[k] for k in attrs.keys()} - - if F.__name__ == 'tvm.relay.op': - # Derive channels for frontends (e.g ONNX) that miss "channel" field. - new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - dilation = attrs.get_int_tuple("dilation") - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - groups = attrs.get_int('groups') - data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" - layout = attrs[data_layout_key] - kernel_layout = attrs['kernel_layout'] - out_dtype = attrs["out_dtype"] - if out_dtype in ("same", ""): - out_dtype = tinfos[0].dtype - - if dilation != (1, 1): - logger.warning("Does not support weight pre-transform for dilated convolution.") - return None - - # query config of this workload - data, kernel = tinfos[0:2] - if groups == 1: - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) - else: - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) - - if layout == 'NCHW' and kernel_layout == 'OIHW': - N, CI, H, W = get_const_tuple(data.shape) - CO, _, KH, KW = get_const_tuple(kernel.shape) - elif layout == 'NHWC' and kernel_layout == 'HWIO': - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, _, CO = get_const_tuple(kernel.shape) - # Also modify the workload to pick up because later we convert to NCHW - # layout. - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, CI, KH, KW), dtype=kernel.dtype) - new_layout = 'NCHW' - workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], conv2d) - elif layout == 'NHWC' and kernel_layout == 'HWOI': - # This is the case for depthwise convolution. - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, CO, M = get_const_tuple(kernel.shape) - # Also modify the workload to pick up because later we convert to NCHW - # layout. - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, M, KH, KW), dtype=kernel.dtype) - workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) - else: - return None - - idxd = tvm.indexdiv - - if groups == 1: - target = tvm.target.Target.current() - dispatch_ctx = autotvm.DispatchContext.current - cfg = dispatch_ctx.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - if layout == 'NHWC' and kernel_layout == 'HWIO': - new_attrs['data_layout'] = 'NCHW' - new_attrs['kernel_layout'] = 'OIHW' - return F.nn.conv2d(*copy_inputs, **new_attrs) - return None - - if cfg.template_key == 'direct': # pack weight tensor - VC = cfg['tile_co'].size[-1] - new_attrs['kernel_layout'] = 'OIHW%do' % VC - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_attrs[data_layout_key] = 'NCHW' - new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.conv2d(*copy_inputs, **new_attrs) - elif cfg.template_key == "winograd": # pre-compute weight transformation in winograd - if "-device=arm_cpu" in target.options: - tile_size = 4 - VC = cfg['tile_k'].size[-1] - elif "-device=bifrost" in target.options: - tile_size = 2 - VC = 0 - else: - from ..mali.conv2d import _pick_tile_size - tile_size = _pick_tile_size(tinfos[0], tinfos[1]) - VC = cfg['tile_bna'].val - - weight = copy_inputs[1] - if kernel_layout != 'OIHW': - weight = F.transpose(weight, axes=(2, 3, 0, 1)) - weight = F.nn.contrib_conv2d_winograd_weight_transform(weight, - tile_size=tile_size) - if VC > 0: - weight = F.reshape(weight, - newshape=(KH + tile_size - 1, - KW + tile_size - 1, - idxd(CO, VC), VC, CI)) - weight = F.transpose(weight, axes=[0, 1, 2, 4, 3]) - new_weight = tvm.placeholder((KH + tile_size - 1, - KW + tile_size -1, - idxd(CO, VC), CI, VC), - kernel.dtype) - else: - weight = F.reshape(weight, - newshape=(KH + tile_size - 1, KW + tile_size - 1, CO, CI)) - new_weight = tvm.placeholder( - (KH + tile_size - 1, KW + tile_size -1, CO, CI), kernel.dtype - ) - - copy_inputs[1] = weight - new_attrs['tile_size'] = tile_size - new_attrs[data_layout_key] = 'NCHW' - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_weight, strides, padding, dilation, - new_attrs[data_layout_key], out_dtype, tile_size], - conv2d_winograd_without_weight_transform) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - elif cfg.template_key in ["winograd_nnpack_fp16", "winograd_nnpack_fp32"]: - # pre-compute winograd_nnpack transform - # for winograd_nnpack_fp16, the the precomputeprune pass must run on device, - # where float16 is supported - weight_dtype = 'float32' - weight = copy_inputs[1] - if kernel_layout != 'OIHW': - weight = F.transpose(weight, axes=(2, 3, 0, 1)) - weight = F.nn.contrib_conv2d_winograd_weight_transform(weight, - tile_size=tile_size) - transformed_kernel = F.nn.contrib_conv2d_winograd_nnpack_weight_transform( - weight, - convolution_algorithm=cfg['winograd_nnpack_algorithm'].val, - out_dtype=weight_dtype) - copy_inputs[1] = transformed_kernel - - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32") - bias = tvm.placeholder((CO, ), "float32") - new_attrs[data_layout_key] = 'NCHW' - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, bias, strides, - padding, dilation, new_attrs[data_layout_key], out_dtype] - if len(copy_inputs) == 3 else - [new_data, new_kernel, strides, - padding, dilation, new_attrs[data_layout_key], out_dtype], - conv2d_winograd_nnpack_without_weight_transform) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_winograd_nnpack_without_weight_transform( - *copy_inputs, **new_attrs) - else: - raise RuntimeError("Unsupported template_key '%s'" % cfg.template_key) - else: - target = tvm.target.Target.current() - dispatch_ctx = autotvm.DispatchContext.current - cfg = dispatch_ctx.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(tvm.target.Target.current(), workload) - if layout == 'NHWC' and kernel_layout == 'HWOI': - new_attrs['data_layout'] = 'NCHW' - new_attrs['kernel_layout'] = 'OIHW' - return F.nn.conv2d(*copy_inputs, **new_attrs) - return None - if cfg.template_key == 'contrib_spatial_pack': - VC = cfg['tile_co'].size[-1] - new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1]) - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_attrs[data_layout_key] = 'NCHW' - if attrs['kernel_layout'] == 'OIHW': - CO, M, KH, KW = get_const_tuple(kernel.shape) - elif attrs['kernel_layout'] == 'HWOI': - KH, KW, CO, M = get_const_tuple(kernel.shape) - else: - raise RuntimeError("Depthwise conv should either have OIHW/HWIO kernel layout") - new_kernel = tvm.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, out_dtype], - depthwise_conv2d_nchw) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.conv2d(*copy_inputs, **new_attrs) - else: - # currently we only have contrib_spatial_pack and direct template - # add more schedule templates. - return None diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py new file mode 100644 index 000000000000..869b1d44ed64 --- /dev/null +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Conv2D alter op and legalize functions for arm cpu""" + +import logging + +import tvm +from tvm import relay +from tvm import autotvm + +from ..nn import conv2d_alter_layout +from ..util import get_const_tuple + + +logger = logging.getLogger('topi') + + +@conv2d_alter_layout.register(["arm_cpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + idxd = tvm.indexdiv + + if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + + new_attrs['kernel_layout'] = 'OIHW%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nchw_spatial_pack.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + + new_attrs['kernel_layout'] = 'OHWI%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), KH, KW, CI, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nhwc_spatial_pack.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_k'].size[-1] + tile_size = 4 + + weight_expr = inputs[1] + weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform( + weight_expr, tile_size=tile_size) + weight_expr = relay.reshape(weight_expr, + newshape=(KH + tile_size - 1, + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) + weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3]) + + new_attrs['tile_size'] = tile_size + + new_data = data + new_kernel = tvm.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + 'conv2d_nchw_winograd.arm_cpu') + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight_expr, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + # pre-compute winograd_nnpack transform + # for winograd_nnpack_fp16, the the precompute prune pass must run on device, + # where float16 is supported + weight_dtype = 'float32' + weight_expr = inputs[1] + transformed_weight = relay.nn.contrib_conv2d_winograd_nnpack_weight_transform( + weight_expr, + convolution_algorithm=cfg['winograd_nnpack_algorithm'].val, + out_dtype=weight_dtype) + + new_data = data + new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32") + + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, None, strides, padding, dilation, out_dtype], + "conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], transformed_weight, **new_attrs) + elif topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + + new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1]) + + # Store the same config for the altered operator (workload) + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "depthwise_conv2d_nchw_spatial_pack.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + else: + return None diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index 8f43f5c210d4..cd413d659203 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -21,7 +21,6 @@ from tvm import autotvm from .. import generic, tag from ..util import get_const_tuple -from ..nn.conv2d import conv2d_NCHWc_int8 from ..generic import conv2d as conv2d_generic from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -42,9 +41,9 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4) -@autotvm.register_topi_compute(conv2d_NCHWc_int8, ['arm_cpu'], 'direct') -def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): +@autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") +def conv2d_NCHWc_int8(cfg, data, kernel, strides, + padding, dilation, layout, out_layout, out_dtype): # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) @@ -68,8 +67,8 @@ def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc_int8, ['arm_cpu'], ['direct']) -def _schedule_conv2d_NCHWc_int8(cfg, outs): +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.arm_cpu") +def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] @@ -86,7 +85,7 @@ def traverse(op): if 'conv2d_NCHWc_int8' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ @@ -95,9 +94,9 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] # int8 conv kernel is 7-dim - _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape) + _, _, kh, kw, _, _, _ = get_const_tuple(kernel_vec.shape) dtype = "uint" if data.dtype == "uint8" else "int" if kh == 1 and kw == 1: conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8( diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index 350a0227ef48..032ac76ff6a2 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ -78,10 +78,12 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, # fallback support if cfg.is_fallback: if num_tile == 2: # arm cpu - ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct') + ref_log = autotvm.tophub.load_reference_log( + 'arm_cpu', 'rk3399', 'conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) elif num_tile == 3: # mali gpu - ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct') + ref_log = autotvm.tophub.load_reference_log( + 'mali', 'rk3399', 'conv2d_nchw_spatial_pack.mali') cfg.fallback_with_reference_log(ref_log) # ==================================================================== diff --git a/topi/python/topi/arm_cpu/conv2d_transpose.py b/topi/python/topi/arm_cpu/conv2d_transpose.py index 65f1024c88a3..93ff02900f37 100644 --- a/topi/python/topi/arm_cpu/conv2d_transpose.py +++ b/topi/python/topi/arm_cpu/conv2d_transpose.py @@ -21,13 +21,12 @@ import tvm from tvm import autotvm -from ..generic import schedule_conv2d_transpose_nchw -from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple +from ..nn import dilate, pad, get_pad_tuple from ..util import get_const_tuple, traverse_inline from .conv2d_spatial_pack import schedule_conv2d_spatial_pack_nchw -@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct") -def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype): +@autotvm.register_topi_compute("conv2d_transpose_nchw.arm_cpu") +def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters @@ -135,8 +134,8 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n # register customized schedule for arm cpu. -@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct") -def schedule_conv2d_transpose_arm(cfg, outs): +@autotvm.register_topi_schedule("conv2d_transpose_nchw.arm_cpu") +def schedule_conv2d_transpose_nchw(cfg, outs): """Schedule conv2d transpose for arm cpu""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py index 207fc712c450..9a79f984edb1 100644 --- a/topi/python/topi/arm_cpu/depthwise_conv2d.py +++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py @@ -20,19 +20,18 @@ import tvm from tvm import autotvm -from ..generic import schedule_depthwise_conv2d_nchw -from ..nn import depthwise_conv2d_nchw, pad +from .. import nn from ..util import traverse_inline, get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -# register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct', - depthwise_conv2d_nchw.fdefault) -# register customized schedule for arm cpu. -@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', - ['direct', 'contrib_spatial_pack']) -def schedule_depthwise_conv2d_nchw_arm(cfg, outs): +@autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.arm_cpu") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule depthwise conv2d Parameters @@ -65,7 +64,7 @@ def _schedule(cfg, s, data, data_pad, kernel, output): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', 'direct') + 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw.arm_cpu') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### @@ -134,25 +133,12 @@ def _callback(op): data = data_pad.op.input_tensors[0] _schedule(cfg, s, data, data_pad, kernel, output) - if op.tag == 'spatial_depthwise_conv2d_nchw_output': - output = op.output(0) - conv = op.input_tensors[0] - data_vec = conv.op.input_tensors[0] - kernel_vec = conv.op.input_tensors[1] - if kernel_vec.op.name == 'kernel_vec': - kernel = kernel_vec.op.input_tensors[0] - else: - kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - - _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) - traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', ['contrib_spatial_pack']) -def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_dtype): + +@autotvm.register_topi_compute("depthwise_conv2d_nchw_spatial_pack.arm_cpu") +def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for depthwise_conv2d nchw Parameters @@ -189,6 +175,29 @@ def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_ return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2) +@autotvm.register_topi_schedule("depthwise_conv2d_nchw_spatial_pack.arm_cpu") +def schedule_depthwise_conv2d_nchw_spatial_pack(cfg, outs): + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'spatial_depthwise_conv2d_nchw_output': + output = op.output(0) + conv = op.input_tensors[0] + data_vec = conv.op.input_tensors[0] + kernel_vec = conv.op.input_tensors[1] + if kernel_vec.op.name == 'kernel_vec': + kernel = kernel_vec.op.input_tensors[0] + else: + kernel = kernel_vec + if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s + + def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): out_dtype = out_dtype or data.dtype @@ -220,16 +229,16 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, WPAD = pad_left + pad_right DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), - name="data_pad") + data_pad = nn.pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), + name="data_pad") else: data_pad = data # fallback support # Currently, Mali schedule doesn't use it like conv2d. if cfg.is_fallback: - ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', - 'contrib_spatial_pack') + ref_log = autotvm.tophub.load_reference_log( + 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) # ==================== define configuration space ==================== diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 0b6a16d37d1a..644a7e3fb523 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ b/topi/python/topi/arm_cpu/injective.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm -from .. import generic from ..util import is_empty_shape -@generic.schedule_injective_from_existing.register(["arm_cpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -46,7 +44,6 @@ def schedule_injective_from_existing(sch, out): sch[out].parallel(sch[out].op.axis[0]) return sch -@generic.schedule_injective.register(["arm_cpu"]) def schedule_injective(outs): """ARM CPU schedule for injective op. @@ -74,7 +71,6 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@generic.schedule_concatenate.register(["arm_cpu"]) def schedule_concatenate(outs): """Schedule for concatenate op. diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 2ae65800e925..7956d06fc3fa 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -19,23 +19,21 @@ """conv2d schedule on ARM Mali (Bifrost) GPU""" import tvm +from tvm import relay from tvm import autotvm from .gemm import decl_winograd_gemm, schedule_gemm from .transforms import tile_and_bind, tile_and_bind3d -from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform from ..util import traverse_inline, get_const_int, get_const_tuple -from ..nn import conv2d, conv2d_winograd_without_weight_transform, \ - get_pad_tuple, pad, conv2d_alter_layout, dilate +from .. import nn from ..nn.winograd_util import winograd_transform_matrices # reuse some compute declarations from ARM CPU from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw -from ..arm_cpu.conv2d import _alter_conv2d_layout_arm -@autotvm.register_topi_compute(conv2d, 'bifrost', ['direct']) -def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.bifrost") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for conv2d Parameters @@ -60,9 +58,6 @@ def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dt dilation : list of two ints [dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -71,14 +66,12 @@ def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dt output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=3) - raise ValueError("Unsupported layout {}".format(layout)) + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=3) -@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'bifrost', ['direct', 'winograd']) -def schedule_conv2d_nchw_bifrost(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.bifrost") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): """TOPI schedule callback for conv2d Parameters @@ -116,9 +109,6 @@ def _callback(op): _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) - traverse_inline(s, outs[0].op, _callback) return s @@ -195,10 +185,22 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): return s -@autotvm.register_topi_compute(conv2d, 'bifrost', ['winograd']) -def conv2d_bifrost_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_winograd.bifrost") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): """Use Winograd as the convolution method""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.bifrost") +def schedule_conv2d_nchw_winograd(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + _schedule_winograd(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + return s def _decl_winograd_kernel_transform(kernel, tile_size, G): @@ -256,7 +258,7 @@ def upround(x, align): return U -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size=2): +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size=2): """Declare a winograd convolution - only tile_size=2 is currently supported""" N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): @@ -266,7 +268,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if int(kernel.shape[2]) == 3: if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -275,11 +277,10 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt H_CAT, W_CAT, CO, CI = get_const_tuple(kernel.shape) KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size @@ -454,31 +455,77 @@ def _schedule_winograd(cfg, s, op): tile_and_bind3d(s, output, k, h, w, 1, 2, 2) -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'bifrost', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) +##### REGISTER ALTER OP LAYOUT ##### +@nn.conv2d_alter_layout.register(["bifrost"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - 'bifrost', ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): - """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} - def _callback(op): - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype - traverse_inline(s, outs[0].op, _callback) - return s + idxd = tvm.indexdiv + + if topi_tmpl == "conv2d_nchw_spatial_pack.bifrost": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + new_attrs['kernel_layout'] = 'OIHW%do' % VC -##### REGISTER ALTER OP LAYOUT ##### -@conv2d_alter_layout.register(["bifrost"]) -def _alter_conv2d_layout(attrs, inputs, tinfos, F): - try: - return _alter_conv2d_layout_arm(attrs, inputs, tinfos, F) - except KeyError: # to filter out fallback opencl templates + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nchw_spatial_pack.bifrost") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd.bifrost": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + tile_size = 2 + + weight_expr = inputs[1] + weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform( + weight_expr, tile_size=tile_size) + weight_expr = relay.reshape( + weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO, CI)) + + new_attrs['tile_size'] = tile_size + + new_data = data + new_kernel = tvm.placeholder( + (KH + tile_size - 1, KW + tile_size -1, CO, CI), kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + 'conv2d_nchw_winograd.bifrost') + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight_expr, **new_attrs) + else: return None diff --git a/topi/python/topi/bifrost/dense.py b/topi/python/topi/bifrost/dense.py index 114168f27514..dadb8db96bc8 100644 --- a/topi/python/topi/bifrost/dense.py +++ b/topi/python/topi/bifrost/dense.py @@ -15,19 +15,22 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,unused-variable -"""dense schedule on ARM Mali GPU""" +"""dense schedule on ARM Mali Biforst GPU""" from __future__ import absolute_import as _abs import tvm from tvm import autotvm -from .. import generic, nn +from .. import nn from ..util import traverse_inline -autotvm.register_topi_compute(nn.dense, 'bifrost', 'direct', nn.dense.fdefault) +@autotvm.register_topi_compute('dense.biforst') +def dense(_, data, weight, bias=None, out_dtype=None): + """Dense operator on Biforst""" + return nn.dense(data, weight, bias, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_dense, 'bifrost', 'direct') +@autotvm.register_topi_schedule('dense.bifrost') def schedule_dense(cfg, outs): """Schedule for dense operator. @@ -66,7 +69,7 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'mali', 'rk3399', 'dense', 'direct') + 'mali', 'rk3399', 'dense.bifrost') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### diff --git a/topi/python/topi/bifrost/depthwise_conv2d.py b/topi/python/topi/bifrost/depthwise_conv2d.py index 305abee0bcd9..4f7b0db7f95f 100644 --- a/topi/python/topi/bifrost/depthwise_conv2d.py +++ b/topi/python/topi/bifrost/depthwise_conv2d.py @@ -21,11 +21,9 @@ from __future__ import absolute_import as _abs import tvm -from .. import generic from .. import util from .. import tag -@generic.schedule_depthwise_conv2d_nchw.register(["bifrost"]) def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d nchw forward. diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 4c20dd0075d6..6e38318a0062 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -19,23 +19,27 @@ """CUDA specific declaration and schedules.""" from __future__ import absolute_import as _abs -from . import conv1d, conv2d, depthwise_conv2d, conv2d_transpose_nchw, \ - deformable_conv2d, group_conv2d_nchw, dense, conv1d_transpose_ncw -from . import conv3d -from .conv2d_hwcn import schedule_conv2d_hwcn -from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc -from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc -from .group_conv2d_nchw import schedule_conv2d_nchw_cuda +from .conv1d import * +from .conv1d_transpose_ncw import * +from .conv2d import * +from .conv2d_hwcn import * +from .conv2d_int8 import * +from .conv2d_winograd import * +from .depthwise_conv2d import * +from .group_conv2d_nchw import * +from . import conv2d_alter_op +from .conv2d_transpose_nchw import * +from .deformable_conv2d import * +from .conv3d import * from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast -from .dense import schedule_dense -from .pooling import schedule_pool, schedule_adaptive_pool +from .dense import * +from .pooling import * from .nn import schedule_lrn -from .batch_matmul import schedule_batch_matmul +from .batch_matmul import * from .vision import * -from . import ssd from .ssd import * -from .nms import * +from .nms import get_valid_counts, non_max_suppression from .rcnn import * from .sort import * diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py index 24fc2a17aa18..e293c7ad41e8 100644 --- a/topi/python/topi/cuda/batch_matmul.py +++ b/topi/python/topi/cuda/batch_matmul.py @@ -19,34 +19,8 @@ from __future__ import absolute_import as _abs import tvm from tvm.contrib import cublas -from topi.nn import batch_matmul, batch_matmul_default -from .. import generic from ..util import traverse_inline, get_const_tuple, get_max_power2_factor -@batch_matmul.register(["cuda", "gpu"]) -def batch_matmul_cuda(x, y): - """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are - data in batch. - - Parameters - ---------- - x : tvm.Tensor - 3-D with shape [batch, M, K] - - y : tvm.Tensor - 3-D with shape [batch, N, K] - - Returns - ------- - output : tvm.Tensor - 3-D with shape [batch, M, N] - """ - target = tvm.target.Target.current() - if target.target_name == "cuda" and "cublas" in target.libs: - return cublas.batch_matmul(x, y, False, True) - return batch_matmul_default(x, y) - -@generic.schedule_batch_matmul.register(["cuda", "gpu"]) def schedule_batch_matmul(outs): """Schedule for batch_matmul @@ -61,10 +35,6 @@ def schedule_batch_matmul(outs): s: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current() - if target.target_name == "cuda" and "cublas" in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -134,3 +104,22 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + +def batch_matmul_cublas(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.Tensor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + return cublas.batch_matmul(x, y, False, True) diff --git a/topi/python/topi/cuda/conv1d.py b/topi/python/topi/cuda/conv1d.py index 43754a31df48..56918e2bbba2 100644 --- a/topi/python/topi/cuda/conv1d.py +++ b/topi/python/topi/cuda/conv1d.py @@ -19,67 +19,22 @@ import tvm from tvm import autotvm -from .. import nn, generic +from .. import nn from ..util import traverse_inline, get_const_tuple -@autotvm.register_topi_compute(nn.conv1d, ['cuda', 'gpu'], ['direct']) -def conv1d_cuda(cfg, - data, - kernel, - strides, - padding, - dilation, - layout='NCW', - out_dtype='float32'): - """ 1D convolution forward operator for cuda backend. +@autotvm.register_topi_compute("conv1d_ncw.cuda") +def conv1d_ncw(cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype='float32'): + return nn.conv1d_ncw(data, kernel, strides, padding, dilation, out_dtype) - Parameters - ---------- - cfg : ConfigEntity - The config for this template - - data : tvm.Tensor - 3-D input shape [batch, in_channel, in_width] for layout == 'NCW' - and [batch, in_width, in_channel] for layout == 'NWC' - - kernel : tvm.Tensor - 3-D kernel with shape [num_filter, in_channel, filter_size] for layout == 'NCW' - and [filter_size, in_channel, num_filter] for layout == 'NWC' - - strides : int or tuple - The spatial stride along width - padding : int or str - Padding size, or ['VALID', 'SAME'] - - dilation : int or tuple - Dilation rate if convolution should be dilated. - - layout : str - How input data is laid out, must be one of ['NCW', 'NWC'] - - out_dtype : str - The output data type. If None then output is same type as input. - """ - if out_dtype is None: - out_dtype = data.dtype - if isinstance(strides, (tuple, list)): - strides = strides[0] - if isinstance(dilation, (tuple, list)): - dilation = dilation[0] - - if layout == 'NCW': - return nn.conv1d_ncw(data, kernel, strides, padding, dilation, - out_dtype) - if layout == 'NWC': - return nn.conv1d_nwc(data, kernel, strides, padding, dilation, - out_dtype) - raise ValueError("This layout is not yet supported: {}".format(layout)) - - -@autotvm.register_topi_schedule(generic.schedule_conv1d_ncw, ["cuda", "gpu"], - ["direct"]) +@autotvm.register_topi_schedule("conv1d_ncw.cuda") def schedule_conv1d_ncw(cfg, outs): """TOPI schedule callback of conv1d ncw for cuda gpu @@ -193,8 +148,18 @@ def _callback(op): return s -@autotvm.register_topi_schedule(generic.schedule_conv1d_nwc, ["cuda", "gpu"], - ["direct"]) +@autotvm.register_topi_compute("conv1d_nwc.cuda") +def conv1d_nwc(cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype='float32'): + return nn.conv1d_nwc(data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv1d_nwc.cuda") def schedule_conv1d_nwc(cfg, outs): """TOPI schedule callback of conv1d nwc for cuda gpu diff --git a/topi/python/topi/cuda/conv1d_transpose_ncw.py b/topi/python/topi/cuda/conv1d_transpose_ncw.py index 4cedbd529f02..4802a0d144a3 100644 --- a/topi/python/topi/cuda/conv1d_transpose_ncw.py +++ b/topi/python/topi/cuda/conv1d_transpose_ncw.py @@ -19,11 +19,11 @@ import tvm from tvm import autotvm -from .. import nn, generic +from .. import nn from ..util import get_const_tuple, traverse_inline -@autotvm.task.register_topi_compute(nn.conv1d_transpose_ncw, ['cuda', 'gpu'], "direct") -def conv1d_transpose_ncw_cuda(cfg, data, kernel, stride, padding, out_dtype): +@autotvm.task.register_topi_compute("conv1d_transpose_nchw.cuda") +def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype): """Transposed 1D convolution ncw forward operator. Parameters @@ -79,9 +79,8 @@ def conv1d_transpose_ncw_cuda(cfg, data, kernel, stride, padding, out_dtype): return data_out -@autotvm.task.register_topi_schedule(generic.schedule_conv1d_transpose_ncw, - ['cuda', 'gpu'], 'direct') -def schedule_conv1d_transpose_ncw_cuda(cfg, outs): +@autotvm.task.register_topi_schedule("conv1d_transpose_nchw.cuda") +def schedule_conv1d_transpose_ncw(cfg, outs): """TOPI Schedule callback for conv1d_transpose operator. Parameters diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index f26069cfc3f0..6fabb9d076ca 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -23,179 +23,91 @@ from .. import nn, generic from ..nn.util import get_pad_tuple from ..util import get_const_tuple, traverse_inline - from .conv2d_direct import schedule_direct_cuda -from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda -from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8 - - -@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) -def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): - """Conv2D operator for cuda backend. - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or - 5-D with shape [batch, ic_chunk, in_height, in_width, ic_block] - - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, - filter_width, num_filter_block, in_channel_block] - - strides : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - layout : str - layout of data +@autotvm.register_topi_compute("conv2d_nchw.cuda") +def conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - out_dtype: str - The output type. This is used for mixed precision. - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - target = tvm.target.Target.current() - - if "cudnn" in target.libs: - if layout == 'NCHW': - tensor_format = 0 # CUDNN_TENSOR_NCHW - N, _, H, W = get_const_tuple(data.shape) - elif layout == 'NHWC': - tensor_format = 1 # CUDNN_TENSOR_NHWC - N, H, W, _ = get_const_tuple(data.shape) - else: - raise ValueError("Unsupported layout %s in cudnn" % layout) - CO, CI, KH, KW = get_const_tuple(kernel.shape) - - # handle dilation - stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides - dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation - - if isinstance(padding, (list, tuple)) and len(padding) == 4 and \ - (padding[0] != padding[2] or padding[1] != padding[3]): - raise ValueError("Cudnn doesn't support asymmetric padding.") - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - OH = (H + pt + pb - KH) // stride_h + 1 - OW = (W + pl + pr - KW) // stride_w + 1 - cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ - ((KW - 1) * dilation_w + 1)) - - if data.dtype == "int8" or kernel.dtype == "int8": - if layout == 'NCHW': - raise ValueError("NCHW layout do not support int8 in cudnn") - dtype = "int32" - else: - dtype = data.dtype - - return cudnn.conv_forward(data, - kernel, - [pt, pl], # cudnn padding pt, pl on both sides of input - [stride_h, stride_w], - [dilation_h, dilation_w], - conv_mode=1, - tensor_format=tensor_format, - algo=-1, # let CUDNN choose the best algo - conv_dtype=dtype) - - if cfg.template_key == 'winograd': - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - pre_computed=False) - if cfg.template_key == 'int8': - if (data.dtype == 'int8' or data.dtype == 'uint8'): - return conv2d_NCHWc_int8( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - - if layout == 'NCHW': - return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - if layout == 'HWCN': - return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - if layout == 'NHWC': - return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], - ["direct", 'winograd', "int8"]) -def schedule_conv2d_nchw_cuda(cfg, outs): - """TOPI schedule callback of conv2d for cuda gpu - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) +@autotvm.register_topi_schedule("conv2d_nchw.cuda") +def schedule_conv2d_nchw(cfg, outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv2d_nchw': schedule_direct_cuda(cfg, s, op.output(0)) - if op.tag == 'conv2d_nchw_winograd': - schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) - if op.tag == "conv2d_NCHWc_int8": - schedule_conv2d_NCHWc_int8(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv2d_nhwc, ["cuda", "gpu"], - ["direct"]) -def schedule_conv2d_nhwc_cuda(cfg, outs): - """TOPI schedule for CUDA conv2d_nhwc - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) +# TODO(@alexgl-github): It's invalid to call schedule_direct_cuda for NHWC layout +# as it assumes the input layout to be NCHW. Please fix this. +# @autotvm.register_topi_compute("conv2d_nhwc.cuda") +# def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): +# return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) +# +# +# @autotvm.register_topi_schedule("conv2d_nhwc.cuda") +# def schedule_conv2d_nhwc(cfg, outs): +# outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs +# s = tvm.create_schedule([x.op for x in outs]) +# +# def _callback(op): +# if op.tag == 'conv2d_nhwc': +# schedule_direct_cuda(cfg, s, op.output(0)) +# +# traverse_inline(s, outs[0].op, _callback) +# return s - def _callback(op): - if op.tag == 'conv2d_nhwc': - schedule_direct_cuda(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s +@autotvm.register_topi_compute("conv2d_cudnn.cuda") +def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW', + out_dtype='float32'): + if layout == 'NCHW': + tensor_format = 0 # CUDNN_TENSOR_NCHW + N, _, H, W = get_const_tuple(data.shape) + elif layout == 'NHWC': + tensor_format = 1 # CUDNN_TENSOR_NHWC + N, H, W, _ = get_const_tuple(data.shape) + else: + raise ValueError("Unsupported layout %s in cudnn" % layout) + CO, CI, KH, KW = get_const_tuple(kernel.shape) + + # handle dilation + stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides + dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation + + if isinstance(padding, (list, tuple)) and len(padding) == 4 and \ + (padding[0] != padding[2] or padding[1] != padding[3]): + raise ValueError("Cudnn doesn't support asymmetric padding.") + pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + OH = (H + pt + pb - KH) // stride_h + 1 + OW = (W + pl + pr - KW) // stride_w + 1 + cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \ + ((KW - 1) * dilation_w + 1)) + + if data.dtype == "int8" or kernel.dtype == "int8": + if layout == 'NCHW': + raise ValueError("NCHW layout do not support int8 in cudnn") + dtype = "int32" + else: + dtype = data.dtype + + return cudnn.conv_forward(data, + kernel, + [pt, pl], # cudnn padding pt, pl on both sides of input + [stride_h, stride_w], + [dilation_h, dilation_w], + conv_mode=1, + tensor_format=tensor_format, + algo=-1, # let CUDNN choose the best algo + conv_dtype=dtype) + + +@autotvm.register_topi_schedule("conv2d_cudnn.cuda") +def schedule_conv2d_cudnn(cfg, outs): + return generic.schedule_extern(outs) diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py new file mode 100644 index 000000000000..614158c1ac3d --- /dev/null +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Conv2D alter op and legalize functions for cuda backend""" + +import logging +import tvm +from tvm import relay +from tvm import autotvm + +from .. import nn +from ..util import get_const_tuple +from .conv2d_winograd import _infer_tile_size + +logger = logging.getLogger('topi') + +@nn.conv2d_alter_layout.register(["cuda", "gpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int('groups') + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc_int8.cuda": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + new_layout = 'NCHW4c' + new_attrs["channels"] = CO + new_attrs["data_layout"] = new_layout + new_attrs['out_layout'] = new_layout + new_attrs['kernel_layout'] = 'OIHW4o4i' + ic_block_factor = oc_block_factor = 4 + + # Store the same config for the altered operator (workload) + new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW, \ + oc_block_factor, ic_block_factor), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], + "conv2d_NCHWc_int8.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd.cuda": + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + # pre-compute weight transformation in winograd + tile_size = _infer_tile_size(tinfos[0], tinfos[1]) + + weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], + tile_size=tile_size) + weight = relay.transpose(weight, axes=[0, 1, 3, 2]) + new_attrs['tile_size'] = tile_size + new_attrs['channels'] = CO + + # Store the same config for the altered operator (workload) + new_data = data + new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype, tile_size], + "conv2d_nchw_winograd_without_weight_transform.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs) + elif topi_tmpl == "group_conv2d_NCHWc_int8.cuda": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + new_layout = 'NCHW4c' + new_attrs["channels"] = CO + new_attrs["data_layout"] = new_layout + new_attrs['out_layout'] = new_layout + new_attrs['kernel_layout'] = 'OIHW4o4i' + ic_block_factor = oc_block_factor = 4 + + # Store the same config for the altered operator (workload) + new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups, + KH, KW, oc_block_factor, ic_block_factor), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, groups, out_dtype], + "group_conv2d_NCHWc_int8.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.conv2d(*inputs, **new_attrs) + else: + return None diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py index b7df88579f49..2fab8cf12253 100644 --- a/topi/python/topi/cuda/conv2d_direct.py +++ b/topi/python/topi/cuda/conv2d_direct.py @@ -43,7 +43,7 @@ def schedule_direct_cuda(cfg, s, conv): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'conv2d', 'direct') + target.target_name, target.model, 'conv2d_nchw.cuda') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 18a624a67aea..635bf4d2fd6e 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -20,10 +20,14 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .. import generic, tag +from .. import nn, tag +@autotvm.register_topi_compute("conv2d_hwcn.cuda") +def conv2d_hwcn(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_hwcn, ["cuda", "gpu"], ["direct"]) + +@autotvm.register_topi_schedule("conv2d_hwcn.cuda") def schedule_conv2d_hwcn(cfg, outs): """Schedule for conv2d_hwcn and any element-wise operations. diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 580cf96b53e8..cab1191be5fc 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -23,9 +23,10 @@ from .tensor_intrin import dp4a from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple +from ..util import get_const_tuple, traverse_inline +@autotvm.register_topi_compute("conv2d_NCHWc_int8.cuda") def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype): """Convolution operator in NCHW[x]c layout for int8. @@ -152,7 +153,20 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ _dp4a = dp4a('shared', 'shared', 'local') -def schedule_conv2d_NCHWc_int8(cfg, s, output): +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.cuda") +def schedule_conv2d_NCHWc_int8(cfg, outs): + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'conv2d_NCHWc_int8': + _schedule_conv2d_NCHWc_int8(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_conv2d_NCHWc_int8(cfg, s, output): """Schedule conv2d int8 NCHWc template""" conv = output.op.input_tensors[0] packed_data, packed_kernel = conv.op.input_tensors diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index be9f31567bc9..c39a2fcac6a6 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -20,12 +20,12 @@ import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import nn, generic +from .. import nn from ..util import get_const_tuple, traverse_inline -@autotvm.task.register_topi_compute(nn.conv2d_transpose_nchw, ['cuda', 'gpu'], "direct") -def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype): +@autotvm.register_topi_compute("nn.conv2d_transpose_nchw.cuda") +def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters @@ -101,9 +101,8 @@ def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype): return data_out -@autotvm.task.register_topi_schedule(generic.schedule_conv2d_transpose_nchw, - ['cuda', 'gpu'], 'direct') -def schedule_conv2d_transpose_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("nn.conv2d_transpose_nchw.cuda") +def schedule_conv2d_transpose_nchw(cfg, outs): """TOPI Schedule callback for conv2d transpose operator. Parameters diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 37307d62357d..0f22a48bd368 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -22,9 +22,7 @@ from tvm import autotvm from .. import nn -from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform from ..util import get_const_int, get_const_tuple, traverse_inline -from ..generic import schedule_conv2d_winograd_without_weight_transform from ..nn.winograd_util import winograd_transform_matrices @@ -37,10 +35,9 @@ def _infer_tile_size(data, kernel): return 4 return 2 -def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed): +def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed): """Compute declaration for winograd""" - assert layout == 'NCHW' - tile_size = _infer_tile_size(data, kernel) N, CI, H, W = get_const_tuple(data.shape) @@ -53,7 +50,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty if not pre_computed: # kernel tensor is raw tensor, do strict check if dilation_h != 1 or dilation_w != 1: - kernel = dilation(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) CO, CI, KH, KW = get_const_tuple(kernel.shape) alpha = KW + tile_size - 1 assert HSTR == 1 and WSTR == 1 and KH == KW @@ -282,161 +279,38 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): return s -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, - ['cuda', 'gpu'], ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - pre_computed=True) - +@autotvm.register_topi_compute("conv2d_nchw_winograd.cuda") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=False) -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - ['cuda', 'gpu'], ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_cuda(cfg, outs): - """TOPI schedule callback""" +@autotvm.register_topi_schedule("conv2d_nchw_winograd.cuda") +def schedule_conv2d_nchw_winograd(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if 'conv2d_nchw_winograd' in op.tag: - schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) traverse_inline(s, outs[0].op, _callback) return s -##### REGISTER ALTER OP LAYOUT ##### -@nn.conv2d_alter_layout.register(["cuda", "gpu"]) -def _alter_conv2d_layout(attrs, inputs, tinfos, F): - """Alter op layout for pre-computing kernel transformation - - Parameters - ---------- - attrs : tvm.ir.Attrs - Attributes of current convolution - inputs : tvm.relay.Expr - Grouped input symbols - tinfos : list - Input shape and dtype - F: symbol - The context, can be relay.op - - Note - ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. - """ - if 'cudnn' in tvm.target.Target.current().libs or 'miopen' in tvm.target.Target.current().libs: - return None - - copy_inputs = list(inputs) - new_attrs = {k: attrs[k] for k in attrs.keys()} - - - new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int('groups') - data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" - layout = attrs[data_layout_key] - out_dtype = attrs["out_dtype"] - if out_dtype in ("", "same"): - out_dtype = tinfos[0].dtype - - data, kernel = tinfos[0:2] - N, CI, H, W = get_const_tuple(data.shape) - CO, _, KH, KW = get_const_tuple(kernel.shape) +@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.cuda") +def conv2d_nchw_winograd_without_weight_transform(cfg, data, kernel, strides, + padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=True) - dispatch_ctx = autotvm.DispatchContext.current - target = tvm.target.Target.current() - if groups == 1: - # query config of this workload - workload = autotvm.task.args_to_workload( - [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d) - cfg = autotvm.DispatchContext.current.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - return None - - if cfg.template_key == 'direct': - return None - - if cfg.template_key == 'int8': - assert 'cuda' in target.keys - new_layout = 'NCHW4c' - new_attrs[data_layout_key] = new_layout - new_attrs['out_layout'] = new_layout - new_attrs['kernel_layout'] = 'OIHW4o4i' - ic_block_factor = oc_block_factor = 4 - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,\ - oc_block_factor, ic_block_factor), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], - conv2d - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.conv2d(*copy_inputs, **new_attrs) - - if attrs.get_int_tuple("dilation") != (1, 1): - logger.warning("Does not support weight pre-transform for dilated convolution.") - return None - - # pre-compute weight transformation in winograd - tile_size = _infer_tile_size(tinfos[0], tinfos[1]) - - weight = F.nn.contrib_conv2d_winograd_weight_transform(copy_inputs[1], - tile_size=tile_size) - weight = F.transpose(weight, axes=[0, 1, 3, 2]) - copy_inputs[1] = weight - new_attrs['tile_size'] = tile_size - - # Store the same config for the altered operator (workload) - new_data = data - new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size], - conv2d_winograd_without_weight_transform - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - if groups != CI: - workload = autotvm.task.args_to_workload( - [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype], - group_conv2d_nchw) - cfg = autotvm.DispatchContext.current.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - return None - - if cfg.template_key == 'int8': - assert 'cuda' in target.keys - new_layout = 'NCHW4c' - new_attrs[data_layout_key] = new_layout - new_attrs['out_layout'] = new_layout - new_attrs['kernel_layout'] = 'OIHW4o4i' - ic_block_factor = oc_block_factor = 4 - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,\ - KH, KW, oc_block_factor, ic_block_factor), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, groups, out_dtype], - group_conv2d_nchw - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.conv2d(*copy_inputs, **new_attrs) - - # do nothing for depthwise convolution - return None +@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.cuda") +def schedule_conv2d_nchw_winograd_without_weight_transform_cuda(cfg, outs): + """TOPI schedule callback""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv2d_nchw_winograd' in op.tag: + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index b46f284ef5b7..016fc7fb757c 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -21,14 +21,13 @@ from tvm.contrib import cudnn from .. import nn, generic -from ..nn.util import get_pad_tuple3d from ..util import get_const_tuple, traverse_inline +from .conv3d_direct import schedule_direct_conv3d_cuda -from .conv3d_direct import schedule_direct_3d_cuda - -@autotvm.register_topi_compute(nn.conv3d, ['cuda', 'gpu'], ['direct']) -def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', out_dtype='float32'): +@autotvm.register_topi_compute("conv3d_ncdhw.cuda") +def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', + out_dtype='float32'): """Conv3D operator for cuda backend. Parameters @@ -45,10 +44,8 @@ def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', o strides : int or a list/tuple of three ints stride size, or [stride_depth, stride_height, stride_width] - padding : int or a list/tuple of 3 or 6 ints - padding size, or - [pad_depth, pad_height, pad_width] for 3 ints, or - [pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right] for 6 ints + padding : int or a list/tuple of three ints + padding size, or [pad_depth, pad_height, pad_width] dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] @@ -64,52 +61,11 @@ def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', o output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - target = tvm.target.Target.current() - - if "cudnn" in target.libs: - if layout == 'NCDHW': - tensor_format = 0 # CUDNN_TENSOR_NCHW - N, _, D, H, W = get_const_tuple(data.shape) - elif layout == 'NDHWC': - tensor_format = 1 # CUDNN_TENSOR_NHWC - N, D, H, W, _ = get_const_tuple(data.shape) - else: - raise ValueError("Unsupported layout %s in cudnn" % layout) - CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) - - # handle dilation - stride_d, stride_h, stride_w = (strides, strides, strides) if isinstance(strides, int) \ - else strides - if isinstance(padding, (list, tuple)) and len(padding) > 3: - raise ValueError("Cudnn doesn't support asymmetric padding.") - pf, pt, pl, pk, pb, pr = get_pad_tuple3d(padding, (KD, KH, KW)) - dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) if \ - isinstance(dilation, int) else dilation - - OD = (D + pf + pk - KD) // stride_d + 1 - OH = (H + pt + pb - KH) // stride_h + 1 - OW = (W + pl + pr - KW) // stride_w + 1 - cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) *\ - ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) - - return cudnn.conv_forward(data, - kernel, - [pf, pt, pl], # cudnn padding pt, pl on both sides of input - [stride_d, stride_h, stride_w], - [dilation_d, dilation_h, dilation_w], - conv_mode=1, - tensor_format=tensor_format, - algo=-1, # let CUDNN choose the best algo - conv_dtype=data.dtype) - - if layout == 'NCDHW': - return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) + return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, layout, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv3d_ncdhw, ["cuda", "gpu"], - ["direct"]) -def schedule_conv3d_ncdhw_cuda(cfg, outs): +@autotvm.register_topi_schedule("conv3d_ncdhw.cuda") +def schedule_conv3d_ncdhw(cfg, outs): """TOPI schedule callback of conv3d for cuda gpu Parameters @@ -126,24 +82,59 @@ def schedule_conv3d_ncdhw_cuda(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ncdhw': - schedule_direct_3d_cuda(cfg, s, op.output(0)) + schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NCDHW", + "conv3d_ncdhw.cuda") traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv3d_ndhwc, ["cuda", "gpu"], - ["direct"]) -def schedule_conv3d_ndhwc_cuda(cfg, outs): +@autotvm.register_topi_compute("conv3d_ndhwc.cuda") +def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout='NDHWC', + out_dtype='float32'): + """Conv3D operator for cuda backend. + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data : tvm.Tensor + 5-D with shape [batch, in_channel, in_depth, in_height, in_width] + + kernel : tvm.Tensor + 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + + strides : int or a list/tuple of three ints + stride size, or [stride_depth, stride_height, stride_width] + + padding : int or a list/tuple of three ints + padding size, or [pad_depth, pad_height, pad_width] + + dilation: int or a list/tuple of three ints + dilation size, or [dilation_depth, dilation_height, dilation_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel, out_depth, out_height, out_width] + """ + return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, layout, out_dtype) + + +@autotvm.register_topi_schedule("conv3d_ndhwc.cuda") +def schedule_conv3d_ndhwc(cfg, outs): """TOPI schedule callback of conv3d for cuda gpu Parameters @@ -160,16 +151,104 @@ def schedule_conv3d_ndhwc_cuda(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ndhwc': - schedule_direct_3d_cuda(cfg, s, op.output(0)) + schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NDHWC", + "conv3d_ndhwc.cuda") traverse_inline(s, outs[0].op, _callback) return s + + +@autotvm.register_topi_compute("conv3d_cudnn.cuda") +def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', + out_dtype='float32'): + """Conv3D operator for cuda backend. + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data : tvm.Tensor + 5-D with shape [batch, in_channel, in_depth, in_height, in_width] + + kernel : tvm.Tensor + 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + + strides : int or a list/tuple of three ints + stride size, or [stride_depth, stride_height, stride_width] + + padding : int or a list/tuple of three ints + padding size, or [pad_depth, pad_height, pad_width] + + dilation: int or a list/tuple of three ints + dilation size, or [dilation_depth, dilation_height, dilation_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.Tensor + 5-D with shape [batch, out_channel, out_depth, out_height, out_width] + """ + if layout == 'NCDHW': + tensor_format = 0 # CUDNN_TENSOR_NCHW + N, _, D, H, W = get_const_tuple(data.shape) + elif layout == 'NDHWC': + tensor_format = 1 # CUDNN_TENSOR_NHWC + N, D, H, W, _ = get_const_tuple(data.shape) + else: + raise ValueError("Unsupported layout %s in cudnn" % layout) + CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) + + # handle dilation + stride_d, stride_h, stride_w = (strides, strides, strides) if isinstance(strides, int) \ + else strides + pad_d, pad_h, pad_w = (padding, padding, padding) if isinstance(padding, int) else padding + dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) if \ + isinstance(dilation, int) else dilation + + OD = (D + 2 * pad_d - KD) // stride_d + 1 + OH = (H + 2 * pad_h - KH) // stride_h + 1 + OW = (W + 2 * pad_w - KW) // stride_w + 1 + cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) * \ + ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) + + return cudnn.conv_forward(data, + kernel, + [pad_d, pad_h, pad_w], + [stride_d, stride_h, stride_w], + [dilation_d, dilation_h, dilation_w], + conv_mode=1, + tensor_format=tensor_format, + algo=-1, # let CUDNN choose the best algo + conv_dtype=dtype) + + +@autotvm.register_topi_schedule("conv3d_cudnn.cuda") +def schedule_conv3d_cudnn(_, outs): + """TOPI schedule callback of conv3d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d. + """ + return generic.schedule_extern(outs) diff --git a/topi/python/topi/cuda/conv3d_direct.py b/topi/python/topi/cuda/conv3d_direct.py index ad48deb27539..fa6c8781b5d3 100644 --- a/topi/python/topi/cuda/conv3d_direct.py +++ b/topi/python/topi/cuda/conv3d_direct.py @@ -20,11 +20,16 @@ from tvm import autotvm from ..util import get_const_tuple -def schedule_direct_3d_cuda(cfg, s, conv): +def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name): """schedule optimized for batch size = 1""" ##### space definition begin ##### - n, f, d, y, x = s[conv].op.axis + if layout == "NCDHW": + n, f, d, y, x = s[conv].op.axis + elif layout == "NDHWC": + n, d, y, x, f = s[conv].op.axis + else: + raise ValueError("not support this layout {} yet".format(layout)) rc, rd, ry, rx = s[conv].op.reduce_axis cfg.define_split("tile_f", f, num_outputs=4) cfg.define_split("tile_d", d, num_outputs=4) @@ -45,7 +50,7 @@ def schedule_direct_3d_cuda(cfg, s, conv): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'conv3d', 'direct') + target.target_name, target.model, workload_name) cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### diff --git a/topi/python/topi/cuda/deformable_conv2d.py b/topi/python/topi/cuda/deformable_conv2d.py index 33a8c9adc1ca..0cf7f5a799cc 100644 --- a/topi/python/topi/cuda/deformable_conv2d.py +++ b/topi/python/topi/cuda/deformable_conv2d.py @@ -18,16 +18,18 @@ """Schedule template of deformable conv2d with cuda backend""" import tvm from tvm import autotvm -from .. import nn, generic +from .. import nn from ..util import traverse_inline -autotvm.register_topi_compute(nn.deformable_conv2d_nchw, ["cuda", "gpu"], "direct", - nn.deformable_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("deformable_conv2d_nchw.cuda") +def deformable_conv2d_nchw(cfg, data, offset, kernel, strides, padding, dilation, + deformable_groups, groups, out_dtype): + return nn.deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, + deformable_groups, groups, out_dtype) - -@autotvm.register_topi_schedule(generic.schedule_deformable_conv2d_nchw, ["cuda", "gpu"], "direct") -def schedule_deformable_conv2d_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("deformable_conv2d_nchw.cuda") +def schedule_deformable_conv2d_nchw(cfg, outs): """TOPI schedule callback of deformable conv2d for cuda gpu Parameters @@ -49,13 +51,13 @@ def schedule_deformable_conv2d_nchw_cuda(cfg, outs): def _callback(op): if op.tag == 'deformable_conv2d_nchw': - schedule_direct_cuda(cfg, s, op.output(0)) + _schedule_direct_cuda(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def schedule_direct_cuda(cfg, s, conv): +def _schedule_direct_cuda(cfg, s, conv): """Schedule template of deformable conv2d""" n, f, y, x = s[conv].op.axis rc, ry, rx = s[conv].op.reduce_axis diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 1a1af703c55c..6cdf5d8b8b3e 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -23,110 +23,59 @@ from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cublas from .tensor_intrin import dp4a -from ..nn.dense import dense, dense_default +from .. import nn from .. import tag from .. import generic from ..util import traverse_inline, get_const_tuple logger = logging.getLogger('topi') - -@autotvm.register_topi_compute(dense, ["cuda", "gpu"], "direct") -def dense_cuda(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator for cuda backend. - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] - - bias : tvm.Tensor, optional - 1-D with shape [out_dim] - - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - # pylint: disable=unused-argument +@autotvm.register_topi_compute("dense_cublas.cuda") +def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA with CUBLAS""" assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype + assert out_dtype == data.dtype, "Mixed precision not supported." batch, in_dim = data.shape out_dim, _ = weight.shape - target = tvm.target.Target.current() - if "cublas" in target.libs: - matmul = cublas.matmul(data, weight, False, True, out_dtype) - if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j], \ - tag=tag.BROADCAST) - return matmul - return dense_default(data, weight, bias, out_dtype) + matmul = cublas.matmul(data, weight, False, True) + cfg.add_flop(batch * in_dim * out_dim * 2) + if bias is not None: + matmul = tvm.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j], + tag=tag.BROADCAST) + return matmul -@autotvm.register_topi_schedule(generic.schedule_dense, ["cuda", "gpu"], "direct") -def schedule_dense(cfg, outs): - """Schedule for dense operator. +@autotvm.register_topi_schedule("dense_cublas.cuda") +def schedule_dense_cublas(_, outs): + """Schedule dense operator using CUBLAS""" + return generic.schedule_extern(outs) - Parameters - ---------- - outs: Array of Tensor - The computation graph description of dense - in the format of an array of tensors. - Returns - ------- - s: Schedule - The computation schedule for dense. - """ - # pylint: disable=unused-argument - target = tvm.target.Target.current() +@autotvm.register_topi_compute("dense_small_batch.cuda") +def dense_small_batch(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA""" + return nn.dense(data, weight, bias, out_dtype) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - if target.target_name == "cuda" and "cublas" in target.libs: - return generic.schedule_extern(outs) +@autotvm.register_topi_schedule("dense_small_batch.cuda") +def schedule_dense_small_batch(cfg, outs): + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - def _schedule(C): - A, _ = C.op.input_tensors - batch, _ = get_const_tuple(A.shape) - if batch < 32: - return schedule_dense_small_batch(cfg, s, C) - return schedule_dense_large_batch(cfg, s, C) - - scheduled_ops = [] - - def traverse(OP): - """Internal traverse function""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): - if OP not in s.outputs: - s[OP].compute_inline() - for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - # schedule dense - elif OP.tag == 'dense': - Dense = OP.output(0) - _schedule(Dense) - else: - raise RuntimeError("Unsupported operator: %s" % OP.tag) - - scheduled_ops.append(OP) + def _callback(op): + if op.tag == 'dense': + _schedule_dense_small_batch(cfg, s, op.output(0)) - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s - -def schedule_dense_small_batch(cfg, s, C): +def _schedule_dense_small_batch(cfg, s, C): """Schedule float32/64 dense with small batch size""" A, _ = C.op.input_tensors _, in_dim = get_const_tuple(A.shape) @@ -152,7 +101,27 @@ def schedule_dense_small_batch(cfg, s, C): s[C].set_store_predicate(thread_x.var.equal(0)) s[Out].set_store_predicate(thread_x.var.equal(0)) -def schedule_dense_large_batch(cfg, s, C): + +@autotvm.register_topi_compute("dense_large_batch.cuda") +def dense_large_batch(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA""" + return nn.dense(data, weight, bias, out_dtype) + + +@autotvm.register_topi_schedule("dense_large_batch.cuda") +def schedule_dense_large_batch(cfg, outs): + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'dense': + _schedule_dense_large_batch(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_large_batch(cfg, s, C): """Schedule float32/64 dense with large batch size""" A, B = C.op.input_tensors batch, in_dim = get_const_tuple(A.shape) @@ -250,7 +219,8 @@ def schedule_dense_large_batch(cfg, s, C): s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) s[BB].double_buffer() -@autotvm.register_topi_compute(dense, ['cuda'], ['int8']) + +@autotvm.register_topi_compute("dense_int8.cuda") def dense_int8(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for int8 on CUDA""" if out_dtype is None: @@ -286,11 +256,11 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): return matmul -@autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8']) +@autotvm.register_topi_schedule("dense_int8.cuda") def schedule_dense_int8(cfg, outs): """Dense schedule for int8 on CUDA""" s = tvm.create_schedule([x.op for x in outs]) - target = tvm.target.Target.current() + target = tvm.target.current_target() outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs if "cublas" in target.libs: diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index 05e1117ac2ce..c8cd7934bd3e 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -20,14 +20,15 @@ from tvm import autotvm from ..util import traverse_inline from .. import tag -from .. import generic, nn +from .. import nn # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct', - nn.depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.cuda") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct') -def schedule_depthwise_conv2d_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.cuda") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule for depthwise_conv2d nchw forward. Parameters @@ -66,7 +67,7 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') + target.target_name, target.model, 'depthwise_conv2d_nchw.cuda') cfg.fallback_with_reference_log(ref_log) # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround cfg['unroll_explicit'].val = 0 @@ -131,7 +132,6 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s -@generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"]) def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d nhwc forward. diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 54e8427daf79..24a4be5dbe92 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -24,15 +24,163 @@ from ..nn.pad import pad from ..nn.util import get_pad_tuple from ..util import traverse_inline, get_const_tuple, get_const_int -from .. import nn, generic +from .. import nn -autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], 'direct', - nn.group_conv2d_nchw.fdefault) - -@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['int8']) +@autotvm.register_topi_compute("group_conv2d_nchw.cuda") def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, out_dtype='float32'): + return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype) + + +@autotvm.register_topi_schedule("group_conv2d_nchw.cuda") +def schedule_group_conv2d_nchw(cfg, outs): + """TOPI schedule callback of group conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for group conv2d. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "group_conv2d_nchw": + _schedule_group_conv2d_nchw_direct(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_group_conv2d_nchw_direct(cfg, s, conv): + """Schedule group conv2d NCHW direct template""" + workload = conv.op.attrs["workload"] + groups = get_const_int(workload[6]) + num_filters = get_const_int(conv.shape[1]) + + ##### space definition begin ##### + n, f, y, x = s[conv].op.axis + rc, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_n", n, num_outputs=4) + cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2) + cfg.define_split("tile_f", cfg.axis(num_filters // groups), num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + target = tvm.target.current_target() + if target.target_name in ['nvptx', 'rocm']: + cfg.define_knob("unroll_explicit", [1]) + else: + cfg.define_knob("unroll_explicit", [0, 1]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, 'local') + else: + output = s.outputs[0].output(0) + s[conv].set_scope('local') + OL = conv + + # create cache stage + AA = s.cache_read(pad_data, 'shared', [OL]) + WW = s.cache_read(kernel, 'shared', [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + kernel_scope, n = s[output].split(n, nparts=1) + + g, f = s[output].split(f, nparts=groups) + bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) + bg, vg = cfg["tile_g"].apply(s, output, g) + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) + s[output].bind(bn, tvm.thread_axis("blockIdx.z")) + s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) + s[output].bind(vn, tvm.thread_axis("vthread")) + s[output].bind(vg, tvm.thread_axis("vthread")) + s[output].bind(vf, tvm.thread_axis("vthread")) + s[output].bind(vy, tvm.thread_axis("vthread")) + s[output].bind(vx, tvm.thread_axis("vthread")) + + cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf + if cfg["fuse_yx"].val: + s[output].bind(tn, tvm.thread_axis("threadIdx.z")) + s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + tyx = s[output].fuse(ty, tx) + s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) + s[OL].compute_at(s[output], tyx) + + # number of threads + n_tz = cfg["tile_n"].size[2] + n_ty = cfg["tile_f"].size[2] + n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] + else: + s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) + s[output].bind(ty, tvm.thread_axis("threadIdx.y")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[OL].compute_at(s[output], tx) + + # number of threads + n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] + n_ty = cfg["tile_y"].size[2] + n_tx = cfg["tile_x"].size[2] + + # tile reduction axes + n, f, y, x = s[OL].op.axis + rc, ry, rx = s[OL].op.reduce_axis + rco, rci = cfg['tile_rc'].apply(s, OL, rc) + ryo, ryi = cfg['tile_rx'].apply(s, OL, ry) + rxo, rxi = cfg['tile_ry'].apply(s, OL, rx) + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + + # cooperative fetching + for load in [AA, WW]: + n, f, y, x = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + fused, tx = s[load].split(fused, factor=n_tx) + fused, ty = s[load].split(fused, factor=n_ty) + fused, tz = s[load].split(fused, factor=n_tz) + s[load].bind(tz, tvm.thread_axis("threadIdx.z")) + s[load].bind(ty, tvm.thread_axis("threadIdx.y")) + s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + + # unroll + s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, CI_div_groups, KH, KW = get_const_tuple(kernel.shape) + cfg.add_flop(2 * N * OH * OW * CO * CI_div_groups * KH * KW) + + +@autotvm.register_topi_compute("group_conv2d_NCHWc_int8.cuda") +def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups, + out_dtype='float32'): """Group convolution operator for 'group_conv2d_NCHWc_int8'. Parameters @@ -155,29 +303,58 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, # Compared with a normal convolution, group convolution only sums # input channels from the group that an output channel resides in. conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb: - tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, - oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] - .astype('int32') * - packed_kernel[occ, icc, - kh, kw, ocb, icb] - .astype('int32'), - axis=[icc, kh, kw, icb])) + tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, + oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] + .astype('int32') * + packed_kernel[occ, icc, + kh, kw, ocb, icb] + .astype('int32'), + axis=[icc, kh, kw, icb])) # Type conversion output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype), tag='group_conv2d_NCHWc_int8') num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ - ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups + ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups cfg.add_flop(num_flop) return output +@autotvm.register_topi_schedule("group_conv2d_NCHWc_int8.cuda") +def schedule_group_conv2d_NCHWc_int8(cfg, outs): + """TOPI schedule callback of group conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for group conv2d. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "group_conv2d_NCHWc_int8": + _schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + _dp4a = dp4a('shared', 'shared', 'local') -def schedule_group_conv2d_NCHWc_int8(cfg, s, output): +def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): """Schedule group conv2d int8 NCHWc template""" workload = output.op.attrs["workload"] groups = get_const_int(workload[6]) @@ -198,7 +375,7 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\ + if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and \ packed_kernel.name == 'packed_kernel': # data and kernel are not pre-computed, schedule layout transform here schedule_injective_from_existing(s, packed_data) @@ -319,151 +496,3 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[output].pragma(kernel_scope, 'unroll_explicit', False) return s - - -def schedule_group_conv2d_nchw_direct(cfg, s, conv): - """Schedule group conv2d NCHW direct template""" - workload = conv.op.attrs["workload"] - groups = get_const_int(workload[6]) - num_filters = get_const_int(conv.shape[1]) - - ##### space definition begin ##### - n, f, y, x = s[conv].op.axis - rc, ry, rx = s[conv].op.reduce_axis - cfg.define_split("tile_n", n, num_outputs=4) - cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2) - cfg.define_split("tile_f", cfg.axis(num_filters // groups), num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - cfg.define_split("tile_rc", rc, num_outputs=2) - cfg.define_split("tile_ry", ry, num_outputs=2) - cfg.define_split("tile_rx", rx, num_outputs=2) - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - - target = tvm.target.Target.current() - if target.target_name in ['nvptx', 'rocm']: - cfg.define_knob("unroll_explicit", [1]) - else: - cfg.define_knob("unroll_explicit", [0, 1]) - - pad_data, kernel = s[conv].op.input_tensors - - s[pad_data].compute_inline() - - if conv.op in s.outputs: - output = conv - OL = s.cache_write(conv, 'local') - else: - output = s.outputs[0].output(0) - s[conv].set_scope('local') - OL = conv - - # create cache stage - AA = s.cache_read(pad_data, 'shared', [OL]) - WW = s.cache_read(kernel, 'shared', [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - kernel_scope, n = s[output].split(n, nparts=1) - - g, f = s[output].split(f, nparts=groups) - bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) - bg, vg = cfg["tile_g"].apply(s, output, g) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vg, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - - cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf - if cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) - tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) - s[OL].compute_at(s[output], tyx) - - # number of threads - n_tz = cfg["tile_n"].size[2] - n_ty = cfg["tile_f"].size[2] - n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] - else: - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) - s[OL].compute_at(s[output], tx) - - # number of threads - n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] - n_ty = cfg["tile_y"].size[2] - n_tx = cfg["tile_x"].size[2] - - # tile reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - rco, rci = cfg['tile_rc'].apply(s, OL, rc) - ryo, ryi = cfg['tile_rx'].apply(s, OL, ry) - rxo, rxi = cfg['tile_ry'].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - fused, tx = s[load].split(fused, factor=n_tx) - fused, ty = s[load].split(fused, factor=n_ty) - fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) - - # unroll - s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) - s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) - - N, CO, OH, OW = get_const_tuple(output.shape) - _, CI_div_groups, KH, KW = get_const_tuple(kernel.shape) - cfg.add_flop(2 * N * OH * OW * CO * CI_div_groups * KH * KW) - - -@autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw, - ["cuda", "gpu"], ["int8", "direct"]) -def schedule_conv2d_nchw_cuda(cfg, outs): - """TOPI schedule callback of group conv2d for cuda gpu - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for group conv2d. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if op.tag == "group_conv2d_NCHWc_int8": - schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0)) - if op.tag == "group_conv2d_nchw": - schedule_group_conv2d_nchw_direct(cfg, s, op.output(0)) - - traverse_inline(s, outs[0].op, _callback) - return s diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py index eb7019bd7654..1690407a1602 100644 --- a/topi/python/topi/cuda/injective.py +++ b/topi/python/topi/cuda/injective.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic, util -from ..util import is_empty_shape +from .. import util -@generic.schedule_injective_from_existing.register(["cuda", "gpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -67,7 +65,6 @@ def schedule_injective_from_existing(sch, out): return sch -@generic.schedule_injective.register(["cuda", "gpu"]) def schedule_injective(outs): """Schedule for injective op. @@ -87,7 +84,7 @@ def schedule_injective(outs): tvm.schedule.AutoInlineInjective(s) for out in outs: - if not is_empty_shape(out.shape): + if not util.is_empty_shape(out.shape): schedule_injective_from_existing(s, out) return s diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 5485859de01f..27a52724fb2d 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -22,7 +22,6 @@ from tvm import api from tvm.intrin import if_then_else -from topi.vision import non_max_suppression, get_valid_counts from .sort import argsort from .. import tag @@ -238,8 +237,7 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out): return ib.get() -@get_valid_counts.register(["cuda", "gpu"]) -def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): +def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -585,11 +583,10 @@ def invalid_to_bottom_ir(data, flag, idx, out): return ib.get() -@non_max_suppression.register(["cuda", "gpu"]) -def non_max_suppression_gpu(data, valid_count, max_output_size=-1, - iou_threshold=0.5, force_suppress=False, top_k=-1, - coord_start=2, score_index=1, id_index=0, - return_indices=True, invalid_to_bottom=False): +def non_max_suppression(data, valid_count, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, + coord_start=2, score_index=1, id_index=0, + return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters diff --git a/topi/python/topi/cuda/nn.py b/topi/python/topi/cuda/nn.py index 327afa87edb5..c0230ec0be48 100644 --- a/topi/python/topi/cuda/nn.py +++ b/topi/python/topi/cuda/nn.py @@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs import tvm -from .. import generic from .. import cpp -@generic.schedule_lrn.register(["cuda"]) def schedule_lrn(outs): """Schedule for LRN @@ -37,6 +35,4 @@ def schedule_lrn(outs): sch: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.cuda.schedule_lrn(cpp_target, outs) + return cpp.cuda.schedule_lrn(outs) diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 2bf1e6bb9ef0..2bebd3912378 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -18,12 +18,9 @@ """Schedule for pooling operators""" import tvm from .. import tag -from .. import generic from ..util import traverse_inline - -@generic.schedule_adaptive_pool.register(["cuda", "gpu"]) def schedule_adaptive_pool(outs): """Schedule for adaptive_pool. @@ -89,7 +86,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["cuda", "gpu"]) def schedule_pool(outs, layout): """Schedule for pool. @@ -153,8 +149,7 @@ def traverse(OP): return s -@generic.schedule_pool_grad.register(['cuda', 'gpu']) -def schedule_pool_grad_cuda(outs): +def schedule_pool_grad(outs): """Schedule for pool_grad on CUDA Parameters diff --git a/topi/python/topi/cuda/rcnn/__init__.py b/topi/python/topi/cuda/rcnn/__init__.py index 42b34f0a31e6..da55b070a807 100644 --- a/topi/python/topi/cuda/rcnn/__init__.py +++ b/topi/python/topi/cuda/rcnn/__init__.py @@ -17,4 +17,4 @@ # pylint: disable=wildcard-import """Faster R-CNN and Mask R-CNN operators""" -from .proposal import * +from .proposal import proposal diff --git a/topi/python/topi/cuda/rcnn/proposal.py b/topi/python/topi/cuda/rcnn/proposal.py index 4344226d787e..71f9c4ac305e 100644 --- a/topi/python/topi/cuda/rcnn/proposal.py +++ b/topi/python/topi/cuda/rcnn/proposal.py @@ -308,9 +308,8 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): return body -@proposal.register("cuda") -def proposal_cuda(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, - rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): +def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, + rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): """Proposal operator. Parameters diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 69c685cb50b4..0b9d5885375e 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -19,7 +19,6 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic from .injective import schedule_injective_from_existing def _schedule_reduce(op, sch, is_idx_reduce=False): @@ -89,7 +88,6 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): return sch -@generic.schedule_reduce.register(["cuda", "gpu"]) def schedule_reduce(outs): """Schedule for inject->reduce->bcast ops. diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index 26a1baffa092..afd11ea0e71e 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -17,10 +17,9 @@ # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" import tvm -from .. import generic from .injective import schedule_injective_from_existing -@generic.schedule_softmax.register(["cuda", "gpu"]) + def schedule_softmax(outs): """Schedule for softmax op. diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py index b32cce75362f..88ca9d876abc 100644 --- a/topi/python/topi/cuda/sort.py +++ b/topi/python/topi/cuda/sort.py @@ -19,10 +19,9 @@ import tvm from tvm import api -from ..sort import argsort, topk +from .injective import schedule_injective_from_existing from ..math import identity from ..transform import strided_slice -from .. import generic from .. import tag def _schedule_sort(outs): @@ -42,8 +41,7 @@ def _schedule_sort(outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] - # pylint: disable=import-outside-toplevel - from .injective import schedule_injective_from_existing + def traverse(op): if tag.is_injective(op.tag): schedule_injective_from_existing(s, op.output(0)) @@ -239,8 +237,7 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): return ib.get() -@argsort.register(["cuda", "gpu"]) -def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): +def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): """Performs sorting along the given axis and returns an array of indicies having same shape as an input array that index data in sorted order. @@ -294,7 +291,6 @@ def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): tag="argsort_gpu")[1] return out -@generic.schedule_argsort.register(["cuda", "gpu"]) def schedule_argsort(outs): """Schedule for argsort operator. @@ -311,8 +307,7 @@ def schedule_argsort(outs): """ return _schedule_sort(outs) -@topk.register(["cuda", "gpu"]) -def topk_gpu(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): +def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): """Get the top k elements in an input tensor along the given axis. Parameters @@ -389,7 +384,6 @@ def topk_gpu(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64" return output -@generic.schedule_topk.register(["cuda", "gpu"]) def schedule_topk(outs): """Schedule for argsort operator. diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 10ba7a1051ea..0b3f50ba0031 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -25,9 +25,6 @@ import topi -from topi.vision.ssd import multibox_prior -from topi.vision.ssd import multibox_detection -from topi.vision.ssd import multibox_transform_loc from ..nms import non_max_suppression @@ -112,9 +109,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): return body -@multibox_prior.register(["cuda", "gpu"]) -def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), - offsets=(0.5, 0.5), clip=False): +def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), + offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. Parameters @@ -346,9 +342,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, return ib.get() -@multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ - threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, \ + threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -426,9 +421,8 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ return [out_loc, valid_count] -@multibox_detection.register(["cuda", "gpu"]) -def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, - force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): +def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. Parameters diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index d456aadf4f5e..499288829e44 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -22,13 +22,13 @@ from .. import cpp from .. import tag from .pooling import schedule_pool +from .injective import schedule_injective_from_existing def _default_schedule(outs): """Default schedule for gpu.""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] - from .injective import schedule_injective_from_existing def traverse(op): if tag.is_broadcast(op.tag) or op.tag in ['bbox_score', 'sorted_bbox']: schedule_injective_from_existing(s, op.output(0)) @@ -39,7 +39,6 @@ def traverse(op): traverse(outs[0].op) return s -@generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): """Schedule for reorg operator. Parameters @@ -57,7 +56,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_injective(cpp_target, outs) -@generic.schedule_nms.register(["cuda", "gpu"]) def schedule_nms(outs): """Schedule for non-maximum suppression @@ -74,7 +72,6 @@ def schedule_nms(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_prior.register(["cuda", "gpu"]) def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. @@ -91,7 +88,6 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_transform_loc.register(["cuda", "gpu"]) def schedule_multibox_transform_loc(outs): """Schedule for multibox_transform_loc @@ -109,7 +105,6 @@ def schedule_multibox_transform_loc(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. @@ -126,15 +121,12 @@ def schedule_multibox_detection(outs): """ return _default_schedule(outs) -@generic.schedule_roi_align.register(["cuda", "gpu"]) def schedule_roi_align(outs): return schedule_pool(outs, 'NCHW') -@generic.schedule_roi_pool.register(["cuda", "gpu"]) def schedule_roi_pool(outs): return schedule_pool(outs, 'NCHW') -@generic.schedule_proposal.register(["cuda", "gpu"]) def schedule_proposal(outs): """Schedule for proposal operator. @@ -151,7 +143,6 @@ def schedule_proposal(outs): """ return _default_schedule(outs) -@generic.schedule_get_valid_counts.register(["cuda", "gpu"]) def schedule_get_valid_counts(outs): """Schedule for get_valid_counts operator. diff --git a/topi/python/topi/generic/conv2d.py b/topi/python/topi/generic/conv2d.py index 332c2fdad459..08bb06c6f855 100644 --- a/topi/python/topi/generic/conv2d.py +++ b/topi/python/topi/generic/conv2d.py @@ -19,6 +19,7 @@ """Generic convolution schedules""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..util import get_const_tuple @@ -109,7 +110,8 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lanes=16, intrin=None): +def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, conv_out, + last, int32_lanes=16, intrin=None): """ Defines the schedule for INT8 for Intel and ARM machines Uses the Intel/ARM intrinsics to use INT8 operations @@ -117,14 +119,39 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lane lower-numerical-precision-deep-learning-inference-and-training """ reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, _ = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. + batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) # schedule 5-D NCHW[x]c conv C, O = conv_out, last @@ -173,7 +200,8 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lane return s -def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, int32_lanes=16, intrin=None): +def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, conv_out, + last, int32_lanes=16, intrin=None): """ Defines the 1x1 conv schedule for INT8 for Intel and ARM machines Uses the Intel/ARM intrinsics to use INT8 operations @@ -181,15 +209,39 @@ def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, int32_lanes=1 lower-numerical-precision-deep-learning-inference-and-training """ oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1] - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. + batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) C, O = conv_out, last CC = s.cache_write(C, 'global') diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index e895385e8b66..977c53763a52 100644 --- a/topi/python/topi/generic/extern.py +++ b/topi/python/topi/generic/extern.py @@ -21,7 +21,6 @@ import tvm from .. import cpp -@tvm.target.generic_func def schedule_extern(outs): """Schedule for an extern op followed by injective operations. diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 2aff96f9636c..6f1013c06dbd 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -20,7 +20,6 @@ import tvm -@tvm.target.override_native_generic_func("schedule_injective_from_existing") def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -36,10 +35,9 @@ def schedule_injective_from_existing(sch, out): sch: Schedule The updated schedule. """ - sch[out].fuse(s[out].op.axis) + sch[out].fuse(*sch[out].op.axis) return sch -@tvm.target.override_native_generic_func("schedule_injective") def schedule_injective(outs): """Schedule for injective op. @@ -64,22 +62,5 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@tvm.target.generic_func -def schedule_concatenate(outs): - """Schedule for concatenate op. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of reduce in the format - of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return schedule_injective(outs) - schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 883182941202..ab926e8fb162 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -18,7 +18,6 @@ """Generic nn operators""" from __future__ import absolute_import as _abs import tvm -from .. import cpp def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" @@ -34,7 +33,6 @@ def _default_schedule(outs, auto_inline): return s -@tvm.target.generic_func def schedule_conv1d_ncw(outs): """Schedule for conv1d_ncw @@ -52,7 +50,6 @@ def schedule_conv1d_ncw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv1d_nwc(outs): """Schedule for conv1d_nwc @@ -70,7 +67,6 @@ def schedule_conv1d_nwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_hwcn(outs): """Schedule for conv2d_hwcn @@ -88,7 +84,6 @@ def schedule_conv2d_hwcn(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw @@ -106,7 +101,6 @@ def schedule_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nhwc_pack(outs): """Schedule for conv2d_nhwc_pack @@ -124,7 +118,6 @@ def schedule_conv2d_nhwc_pack(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nhwc(outs): """Schedule for conv2d_nhwc @@ -142,7 +135,6 @@ def schedule_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_NCHWc(outs): """Schedule for conv2d_NCHW[x]c @@ -161,7 +153,6 @@ def schedule_conv2d_NCHWc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_NCHWc_int8(outs): """Schedule for conv2d_NCHW[x]c_int8 @@ -180,7 +171,6 @@ def schedule_conv2d_NCHWc_int8(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_winograd_weight_transform(outs): """Schedule for weight transformation of winograd @@ -210,7 +200,6 @@ def schedule_conv2d_winograd_weight_transform(outs): return s -@tvm.target.generic_func def schedule_conv2d_winograd_without_weight_transform(outs): """Schedule for winograd without weight transformation @@ -228,7 +217,6 @@ def schedule_conv2d_winograd_without_weight_transform(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_winograd_nnpack_weight_transform(outs): """Schedule for weight transformation of winograd Parameters @@ -245,23 +233,7 @@ def schedule_conv2d_winograd_nnpack_weight_transform(outs): s = tvm.create_schedule([x.op for x in outs]) return s -@tvm.target.generic_func -def schedule_conv2d_winograd_nnpack_without_weight_transform(outs): - """Schedule for winograd without weight transformation - Parameters - ---------- - outs: Array of Tensor - The computation graph description of this operator - in the format of an array of tensors. - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) - -@tvm.target.generic_func def schedule_conv3d_ncdhw(outs): """Schedule for conv3d_ncdhw @@ -278,7 +250,6 @@ def schedule_conv3d_ncdhw(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv3d_ndhwc(outs): """Schedule for conv3d_ndhwc @@ -295,7 +266,6 @@ def schedule_conv3d_ndhwc(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw @@ -313,7 +283,6 @@ def schedule_conv2d_transpose_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv1d_transpose_ncw(outs): """Schedule for conv1d_transpose_ncw @@ -331,7 +300,6 @@ def schedule_conv1d_transpose_ncw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d_nchw @@ -349,7 +317,6 @@ def schedule_depthwise_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d_nhwc Parameters @@ -366,7 +333,6 @@ def schedule_depthwise_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_NCHWc(outs): """Schedule for depthwise_conv2d_NCHWc Parameters @@ -383,7 +349,6 @@ def schedule_depthwise_conv2d_NCHWc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_group_conv2d_nchw(outs): """Schedule for group_conv2d_nchw @@ -401,7 +366,6 @@ def schedule_group_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_deformable_conv2d_nchw(outs): """Schedule for deformable_conv2d_nchw @@ -419,7 +383,6 @@ def schedule_deformable_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_conv2d_nchw(outs): """Schedule for bitserial_conv2d_nchw @@ -437,7 +400,6 @@ def schedule_bitserial_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_conv2d_nhwc(outs): """Schedule for bitserial_conv2d_nhwc @@ -455,7 +417,6 @@ def schedule_bitserial_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_dense(outs): """Schedule for bitserial_dense Parameters @@ -471,7 +432,6 @@ def schedule_bitserial_dense(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): """Schedule for reduction @@ -489,7 +449,6 @@ def schedule_reduce(outs): return _default_schedule(outs, True) -@tvm.target.override_native_generic_func("schedule_softmax") def schedule_softmax(outs): """Schedule for softmax @@ -507,7 +466,6 @@ def schedule_softmax(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_dense") def schedule_dense(outs): """Schedule for dense @@ -525,7 +483,6 @@ def schedule_dense(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_pool") def schedule_pool(outs, layout): """Schedule for pool @@ -546,7 +503,6 @@ def schedule_pool(outs, layout): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_pool_grad(outs): """Schedule for pool_grad @@ -559,7 +515,6 @@ def schedule_pool_grad(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_adaptive_pool") def schedule_adaptive_pool(outs): """Schedule for adaptive pool @@ -595,7 +550,6 @@ def schedule_binarize_pack(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_bitpack") def schedule_bitpack(outs): """Schedule for bitpack Parameters @@ -630,7 +584,6 @@ def schedule_binary_dense(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_lrn(outs): """Schedule for lrn @@ -645,12 +598,9 @@ def schedule_lrn(outs): sch: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.generic.default_schedule(cpp_target, outs, False) + return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_sparse_dense(outs): """Schedule for sparse_dense @@ -667,7 +617,7 @@ def schedule_sparse_dense(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func + def schedule_sparse_transpose(outs): """Schedule for sparse_transpose @@ -684,8 +634,19 @@ def schedule_sparse_transpose(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func + def schedule_batch_matmul(outs): - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.generic.default_schedule(cpp_target, outs, False) + """Schedule for batch_matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of sparse_transpose + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) diff --git a/topi/python/topi/generic/search.py b/topi/python/topi/generic/search.py index 41045e492e53..69f236684bb3 100644 --- a/topi/python/topi/generic/search.py +++ b/topi/python/topi/generic/search.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, no-member """Generic search operators""" from __future__ import absolute_import as _abs -import tvm from .vision import _default_schedule -@tvm.target.generic_func def schedule_argwhere(outs): """Schedule for argwhere operator. diff --git a/topi/python/topi/generic/sort.py b/topi/python/topi/generic/sort.py index 5462f2ce917c..e28ab2c8b20c 100644 --- a/topi/python/topi/generic/sort.py +++ b/topi/python/topi/generic/sort.py @@ -20,7 +20,6 @@ import tvm from .vision import _default_schedule -@tvm.target.generic_func def schedule_argsort(outs): """Schedule for argsort operator. @@ -37,7 +36,6 @@ def schedule_argsort(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_topk(outs): """Schedule for topk operator. diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 85d9153e6424..d6e80df9b89d 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -33,7 +33,6 @@ def _default_schedule(outs, auto_inline): s[x].fuse(s[x].op.axis) return s -@tvm.target.generic_func def schedule_reorg(outs): """Schedule for reorg @@ -52,7 +51,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) -@tvm.target.generic_func def schedule_get_valid_counts(outs): """Schedule for get_valid_counts @@ -69,7 +67,6 @@ def schedule_get_valid_counts(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_nms(outs): """Schedule for non-maximum suppression @@ -86,7 +83,6 @@ def schedule_nms(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_prior(outs): """Schedule for multibox_prior @@ -103,7 +99,6 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_transform_loc(outs): """Schedule for multibox_transform_loc @@ -121,7 +116,6 @@ def schedule_multibox_transform_loc(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_detection(outs): """Schedule for multibox_detection @@ -138,7 +132,6 @@ def schedule_multibox_detection(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_roi_align(outs): """Schedule for roi_align @@ -155,7 +148,6 @@ def schedule_roi_align(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_roi_pool(outs): """Schedule for roi_align @@ -172,7 +164,6 @@ def schedule_roi_pool(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_proposal(outs): """Schedule for proposal operator. diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py index de584287a90e..d4ccf41ed26d 100644 --- a/topi/python/topi/hls/injective.py +++ b/topi/python/topi/hls/injective.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic -@generic.schedule_injective_from_existing.register(["hls"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -40,7 +38,6 @@ def schedule_injective_from_existing(sch, out): sch[out].bind(px, tvm.thread_axis("pipeline")) return sch -@generic.schedule_injective.register(["hls"]) def schedule_injective(outs): """Schedule for injective op. diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py index d73cb9c847f7..06cf3298682d 100644 --- a/topi/python/topi/hls/nn.py +++ b/topi/python/topi/hls/nn.py @@ -19,7 +19,6 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic def _schedule_conv2d(outs): @@ -52,7 +51,6 @@ def traverse(OP): return s -@generic.schedule_conv2d_nchw.register(["hls"]) def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw @@ -70,7 +68,6 @@ def schedule_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_nhwc.register(["hls"]) def schedule_conv2d_nhwc(outs): """Schedule for conv2d_nhwc @@ -88,7 +85,6 @@ def schedule_conv2d_nhwc(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_NCHWc.register(["hls"]) def schedule_conv2d_NCHWc(outs): """Schedule for conv2d_NCHW[x]c @@ -106,7 +102,6 @@ def schedule_conv2d_NCHWc(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_transpose_nchw.register(["hls"]) def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw @@ -124,7 +119,6 @@ def schedule_conv2d_transpose_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_depthwise_conv2d_nchw.register(["hls"]) def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d_nchw @@ -142,7 +136,6 @@ def schedule_depthwise_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_depthwise_conv2d_nhwc.register(["hls"]) def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d_nhwc Parameters @@ -158,7 +151,6 @@ def schedule_depthwise_conv2d_nhwc(outs): """ return _schedule_conv2d(outs) -@generic.schedule_bitserial_conv2d_nchw.register(["hls"]) def schedule_bitserial_conv2d_nchw(outs): """Schedule for bitserial_conv2d_nchw @@ -176,7 +168,6 @@ def schedule_bitserial_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_bitserial_conv2d_nhwc.register(["hls"]) def schedule_bitserial_conv2d_nhwc(outs): """Schedule for bitserial_conv2d_nhwc @@ -194,7 +185,6 @@ def schedule_bitserial_conv2d_nhwc(outs): return _schedule_conv2d(outs) -@generic.schedule_reduce.register(["hls"]) def schedule_reduce(outs): """Schedule for reduction @@ -241,7 +231,6 @@ def traverse(OP): return s -@generic.schedule_softmax.register(["hls"]) def schedule_softmax(outs): """Schedule for softmax @@ -286,7 +275,6 @@ def schedule_softmax(outs): return s -@generic.schedule_dense.register(["hls"]) def schedule_dense(outs): """Schedule for dense @@ -330,7 +318,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["hls"]) def schedule_pool(outs, layout): """Schedule for pool @@ -374,7 +361,6 @@ def traverse(OP): return s -@generic.schedule_adaptive_pool.register(["hls"]) def schedule_adaptive_pool(outs): """Schedule for adaptive_pool diff --git a/topi/python/topi/intel_graphics/__init__.py b/topi/python/topi/intel_graphics/__init__.py index 5223d2d2bbc9..5f82fe758786 100644 --- a/topi/python/topi/intel_graphics/__init__.py +++ b/topi/python/topi/intel_graphics/__init__.py @@ -20,3 +20,5 @@ from __future__ import absolute_import as _abs from .conv2d import * +from . import conv2d_alter_op +from .depthwise_conv2d import * diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 65ea590905f9..0a0dc468f31a 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -20,35 +20,28 @@ from __future__ import absolute_import as _abs import tvm - from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from tvm.autotvm.task.topi_integration import deserialize_args -from tvm.autotvm.task import get_config -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, conv2d_infer_layout -from ..nn.util import get_pad_tuple -from ..nn.depthwise_conv2d import depthwise_conv2d_nchw -from ..nn import pad -from .. import tag -from .. import generic + +from .. import nn from .. import util -from ..util import simplify, get_const_tuple +from ..util import simplify, get_const_tuple, traverse_inline def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False): if is_depthwise: raise RuntimeError("Depthwise not supported for intel graphics.") + else: + batch_size, in_channel, height, width = get_const_tuple(data.shape) + out_channel, _, hkernel, _ = get_const_tuple(kernel.shape) + HSTR, _ = strides - batch_size, in_channel, height, width = get_const_tuple(data.shape) - out_channel, _, hkernel, _ = get_const_tuple(kernel.shape) - HSTR, _ = strides - - ic_bn = 1 - oc_bn, oc_bn_upper = 16, 16 - for i in range(oc_bn_upper, 0, -1): - if out_channel % i == 0: - oc_bn = i - break + ic_bn = 1 + oc_bn, oc_bn_upper = 16, 16 + for i in range(oc_bn_upper, 0, -1): + if out_channel % i == 0: + oc_bn = i + break if HSTR == 2: if out_channel + hkernel == 515: @@ -73,17 +66,12 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth cfg["block_ow"] = OtherOptionEntity(block_ow) -def _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout): +def _create_schedule_template(cfg, dshape, kshape, strides, padding, dilation): """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) - pt, pl, pb, pr = get_pad_tuple(padding, kernel) + n, ic, h, w = dshape + oc, _, kh, kw = kshape + + pt, pl, pb, pr = nn.get_pad_tuple(padding, (kh, kw)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) oh = (h - kh + pt + pb) // sh + 1 ow = (w - kw + pl + pr) // sw + 1 @@ -159,108 +147,59 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None # We define schedule template in this function instead of # declaration function since actual input arguments need # to be altered by the schedule selected. -@autotvm.task.register("topi_intel_graphics_conv2d_NCHWc") -def __topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - data, kernel, strides, padding, dilation, layout, dtype = deserialize_args(args) - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) - - # get config here - cfg = get_config() - _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout) - cfg.add_flop(1) - - # change shape with the value in config - ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] - oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] +# @autotvm.task.register("topi_intel_graphics_conv2d_NCHWc") +# def __topi_nn_conv2d_NCHWc(*args, **kwargs): +# assert not kwargs, "Do not support kwargs in template function call" +# data, kernel, strides, padding, dilation, layout, dtype = deserialize_args(args) +# raw_data_shape = get_const_tuple(data.shape) +# raw_kernel_shape = get_const_tuple(kernel.shape) +# +# # get config here +# cfg = get_config() +# _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout) +# cfg.add_flop(1) +# +# # change shape with the value in config +# ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] +# oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] +# +# new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, +# raw_data_shape[2], raw_data_shape[3], ic_bn) +# new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, +# raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) +# new_data = tvm.placeholder(new_data_shape, data.dtype) +# new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) +# +# C = _decl_cl_spatialpack_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, dtype) +# s = _schedule_conv2d_NCHWc(cfg, [C]) +# +# return s, [new_data, new_kernel, C] - new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, - raw_data_shape[2], raw_data_shape[3], ic_bn) - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, - raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - C = _decl_cl_spatialpack_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, dtype) - s = _schedule_conv2d_NCHWc(cfg, [C]) - - return s, [new_data, new_kernel, C] - -@conv2d_alter_layout.register(["intel_graphics"]) -def _alter_conv2d_layout(attrs, inputs, tinfo, F): - copy_inputs = list(inputs) - new_attrs = {k : attrs[k] for k in attrs.keys()} - - if F.__name__ == 'tvm.relay.op': - # Derive channels for frontends (e.g ONNX) that miss "channel" field. - new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - data, kernel = tinfo[0], tinfo[1] - batch_size, in_channel, height, width = get_const_tuple(data.shape) - - groups = attrs.get_int("groups") - out_channel = attrs.get_int("channels") - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - out_dtype = attrs["out_dtype"] - - layout_name = 'data_layout' - layout = attrs[layout_name] - kh, kw = attrs.get_int_tuple("kernel_size") - - dtype = data.dtype - out_dtype = dtype if out_dtype in ("same", "") else out_dtype - is_depthwise = groups == in_channel and groups == out_channel - - # only optimize for NCHW - if layout != 'NCHW': - return None - if groups != 1 and not is_depthwise: - return None - - dispatch_ctx = autotvm.task.DispatchContext.current - target = tvm.target.Target.current() - - # query schedule and fallback if necessary - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \ - if is_depthwise else \ - autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) - if is_depthwise: - return None - cfg = dispatch_ctx.query(target, workload) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise) - ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] - oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] +def _pack_data(data, kernel, ic_bn, oc_bn): + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) - new_attrs[layout_name] = 'NCHW%dc' % ic_bn - new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data.dtype) + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - out_channel, _, kh, kw = get_const_tuple(kernel.shape) - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[occ * oc_bn + ocb, + icc * ic_bn + icb, k_h, k_w], + name="kernel_vec") - # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], conv2d_NCHWc) + return data, kernel - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) -@autotvm.register_topi_compute(conv2d_NCHWc, 'intel_graphics', 'direct') -def _decl_conv2d(cfg, data, kernel, strides, padding, dilation, - layout, out_layout, out_dtype='float32'): +@autotvm.register_topi_compute("conv2d_NCHWc.intel_graphics") +def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, + out_layout, out_dtype='float32'): """Conv2D operator for Intel Graphics backend. Parameters @@ -285,96 +224,48 @@ def _decl_conv2d(cfg, data, kernel, strides, padding, dilation, output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ + if len(data.shape) == 5: + batch, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) + in_channel = ic_chunk * ic_bn + num_filter = oc_chunk * oc_bn + else: + batch, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_height, kernel_width)) assert (dh, dw) == (1, 1), "Does not support dilation" + if isinstance(strides, (tuple, list)): + stride_h, stride_w = strides + else: + stride_h, stride_w = strides, strides + + data_shape = (batch, in_channel, ih, iw) + kernel_shape = (num_filter, in_channel, kernel_height, kernel_width) + _create_schedule_template(cfg, data_shape, kernel_shape, strides, padding, dilation) - n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) - oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) - in_channel = ic_chunk * ic_bn - num_filter = oc_chunk * oc_bn if cfg.is_fallback: - _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + _get_default_config(cfg, tvm.placeholder((batch, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) - return _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype) - - -@conv2d_infer_layout.register("intel_graphics") -def _conv2d_infer_layout(workload, cfg): - _, data, kernel, strides, padding, dilation, layout, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] - out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 - out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 - tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) - in_layout = "NCHW%dc" % tile_ic - out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc) - out_layout = "NCHW%dc" % tile_oc - return ((in_shape, in_layout),), ((out_shape, out_layout),) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'intel_graphics', ['direct']) -def _schedule_conv2d_NCHWc(cfg, outs): - """Schedule for conv2d_nchw for Intel Graphics - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_injective(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors and tensor.op not in scheduled_ops: - traverse(tensor.op) - if "conv" in op.tag: - _schedule_cl_spatialpack_NCHWc(cfg, s, op) - - scheduled_ops.append(op) - - traverse(outs[0].op) - - return s - -def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype='float16'): - batch, in_channel, in_height, in_width, vc = [util.get_const_int(x) for x in data.shape] - in_channel *= vc - num_filter, channel, kernel_h, kernel_w, ci, co = [util.get_const_int(x) for x in kernel.shape] - num_filter *= co - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) - - ic_bn = vc - assert vc == ci + ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] + oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] - if isinstance(strides, (tuple, list)): - stride_h, stride_w = strides - else: - stride_h, stride_w = strides, strides + # Pack data if raw 4-D data is provided. + if len(data.shape) == 4: + data, kernel = _pack_data(data, kernel, ic_bn, oc_bn) out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel // co, out_height, out_width, co) + out_height = simplify((ih - kernel_height + pad_top + pad_down) // stride_h + 1) + out_width = simplify((iw - kernel_width + pad_left + pad_right) // stride_w + 1) + oshape = (batch, out_channel // oc_bn, out_height, out_width, oc_bn) rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') + ry = tvm.reduce_axis((0, kernel_height), name='ry') + rx = tvm.reduce_axis((0, kernel_width), name='rx') block_h = cfg["block_oh"].val block_w = cfg["block_ow"].val @@ -388,7 +279,7 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou if out_width % block_w != 0: c_w = (out_width // block_w + 1) * block_w - cshape = (batch, out_channel // co, c_h, c_w, co) + cshape = (batch, out_channel // oc_bn, c_h, c_w, oc_bn) pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down + c_h - out_height, pad_right + \ @@ -397,7 +288,7 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou or pad_right + c_w - out_width != 0) DOUNPACK = (c_h - out_height != 0 or c_w - out_width != 0) if DOPAD: - temp = pad(data, pad_before, pad_after, name="pad_temp") + temp = nn.pad(data, pad_before, pad_after, name="pad_temp") else: temp = data @@ -406,33 +297,53 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou lambda nn, ff, yy, xx, ff_v: \ tvm.sum( temp[nn, rc//ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc%ic_bn]. \ - astype(out_dtype) * + astype(out_dtype) * kernel[ff, rc//ic_bn, ry, rx, rc%ic_bn, ff_v].astype(out_dtype), - axis=[rc, ry, rx]), tag="conv", name='conv') + axis=[rc, ry, rx]), tag="conv2d_NCHWc", name='conv2d_NCHWc') if DOUNPACK: output = tvm.compute( oshape, lambda nn, ff, yy, xx, ff_v: conv[nn][ff][yy][xx][ff_v], - name='output_unpack', tag="conv_unpack") + name='output_unpack', tag="conv2d_NCHWc_unpack") else: output = conv - return output +@autotvm.register_topi_schedule("conv2d_NCHWc.intel_graphics") +def schedule_conv2d_NCHWc(cfg, outs): + """Schedule for conv2d_nchw for Intel Graphics + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if "conv2d_NCHWc" in op.tag: + _schedule_cl_spatialpack_NCHWc(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + + return s + + def _schedule_cl_spatialpack_NCHWc(cfg, s, op): output = op.output(0) - conv = op.input_tensors[0] - if conv.op.name == "conv": - temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - SCHEDULE_OUTPUT = True - else: + if op.name == "conv2d_NCHWc": temp = op.input_tensors[0] kernel = op.input_tensors[1] temp_W = s.cache_read(temp, "warp", [output]) @@ -443,8 +354,32 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): s[output].compute_inline() conv = s.outputs[0] SCHEDULE_OUTPUT = False + else: # conv2d_NCHWc_unpack + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + SCHEDULE_OUTPUT = True kernel_L = s.cache_read(kernel, "local", [conv_L]) + if temp.name == "pad_temp": + data = temp.op.input_tensors[0] + # TODO(@Laurawly): Do we need to schedule pad op here? + else: + data = temp + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data].pragma(s[data].op.axis[0], "debug_skip_region") + s[kernel].pragma(s[kernel].op.axis[0], "debug_skip_region") + elif isinstance(kernel.op, tvm.tensor.ComputeOp) and kernel.name == "kernel_vec": + # data and kernel are not pre-computed, schedule layout transform here. + # TODO(@Laurawly): Add schedule for data and kernel pack + pass + OUTPUT_BLOCK_HEIGHT = cfg["block_oh"].val OUTPUT_BLOCK_WIDTH = cfg["block_ow"].val @@ -515,19 +450,7 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): tile_and_bind3d(s, out, w, h, vc, 4, 8, 8) -def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype): - """convert argument to workload""" - if len(kernel.shape) == 4: - raw_kernel = kernel - else: # the input kernel is transformed by alter_op_layout - shape = get_const_tuple(kernel.shape) - raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]), - dtype=kernel.dtype) - return ('conv2d', ) + autotvm.task.args_to_workload( - [data, raw_kernel, strides, padding, layout, out_dtype]) - -@autotvm.register_topi_compute(conv2d, 'intel_graphics', 'direct') -def decl_conv2d(cfg, data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'): +def conv2d_nchw(data, kernel, stride, padding, dilation, out_dtype='float32'): """Conv2D operator for Intel Graphics backend. Parameters @@ -540,21 +463,18 @@ def decl_conv2d(cfg, data, kernel, stride, padding, dilation, layout='NCHW', out stride size, or [stride_height, stride_width] padding : int or a list/tuple of two ints padding size, or [pad_height, pad_width] - layout : str - layout of data Returns ------- output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - return _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype) + return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) + -@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'intel_graphics', ['direct']) -def schedule_conv2d_nchw(cfg, outs): +def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw for Intel Graphics Parameters @@ -569,28 +489,20 @@ def schedule_conv2d_nchw(cfg, outs): """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + def _callback(op): """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors and tensor.op not in scheduled_ops: - traverse(tensor.op) if 'conv2d' in op.tag: - _schedule_cl_spatialpack(cfg, s, op) - - scheduled_ops.append(op) + _schedule_cl_spatialpack(s, op) - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s -def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype='float16'): + +def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) if isinstance(stride, (tuple, list)): stride_h, stride_w = stride @@ -606,8 +518,6 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') - block_w = 1 - block_h = 1 if stride_h == 2: if num_filter + kernel_h == 515: block_h = 4 @@ -640,7 +550,7 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w] - temp = pad(data, pad_before, pad_after, name="pad_temp") + temp = nn.pad(data, pad_before, pad_after, name="pad_temp") nv = 16 if num_filter % nv != 0: @@ -667,13 +577,12 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag='conv2d', - attrs={'workload': conv_arg_to_workload(data, kernel, stride, padding, - layout, out_dtype)}) + name='output_unpack', tag='conv2d') return output -def _schedule_cl_spatialpack(cfg, s, op): + +def _schedule_cl_spatialpack(s, op): output = op.output(0) _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] @@ -742,7 +651,7 @@ def _schedule_cl_spatialpack(cfg, s, op): s[kernel_vec].compute_inline() # schedule kernel_L - if "2_14" in s[conv].op.tag: + if OUTPUT_BLOCK_HEIGHT == 2 and OUTPUT_BLOCK_WIDTH == 14: s[kernel_L].compute_at(s[conv_L], ry) else: s[kernel_L].compute_at(s[conv_L], rx) diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py new file mode 100644 index 000000000000..d21a86909baf --- /dev/null +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Conv2D alter op and legalize functions for x86""" + +import tvm +from tvm import relay +from tvm import autotvm + +from ..util import get_const_tuple +from ..nn import conv2d_alter_layout, conv2d_infer_layout +from .conv2d import _get_default_config + + +@conv2d_alter_layout.register(["intel_graphics"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): + cfg = dispatch_ctx.query(target, None) + workload = cfg.workload + else: + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + + topi_tmpl = workload[0] + new_attrs = {k : attrs[k] for k in attrs.keys()} + + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor = tinfos + data_dtype = data_tensor.dtype + kernel_dtype = kernel_tensor.dtype + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc.intel_graphics": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] + oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], + new_attrs["out_layout"], out_dtype], "conv2d_NCHWc.intel_graphics") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) + else: + return None + + +@conv2d_infer_layout.register("intel_graphics") +def _conv2d_infer_layout(workload, cfg): + _, data, kernel, strides, padding, dilation, layout, dtype = workload + batch_size, in_channel, in_height, in_width = data[:-1] + out_channel, _, k_height, k_width = kernel[:-1] + out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 + out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 + tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) + in_layout = "NCHW%dc" % tile_ic + out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc) + out_layout = "NCHW%dc" % tile_oc + return ((in_shape, in_layout),), ((out_shape, out_layout),) diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py index 97b7376933de..90f4c85d21db 100644 --- a/topi/python/topi/intel_graphics/depthwise_conv2d.py +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -20,16 +20,17 @@ from tvm import autotvm from ..util import traverse_inline from .. import tag -from .. import generic, nn +from .. import nn from ..nn.depthwise_conv2d import depthwise_conv2d_infer_layout # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['intel_graphics'], 'direct', - nn.depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.intel_graphics") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, \ - ['intel_graphics'], 'direct') -def schedule_depthwise_conv2d_nchw_intel_graphics(cfg, outs): + +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.intel_graphics") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule for depthwise_conv2d nchw forward. Parameters @@ -68,7 +69,7 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') + target.target_name, target.model, 'depthwise_conv2d_nchw.intel_graphics') cfg.fallback_with_reference_log(ref_log) cfg['unroll_explicit'].val = 0 ##### space definition end ##### @@ -132,7 +133,7 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s -@generic.schedule_depthwise_conv2d_nhwc.register(["intel_graphics"]) + def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d nhwc forward. diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 35a86e991c23..0ee92280ca96 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -17,22 +17,20 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" import tvm +from tvm import relay from tvm import autotvm from tvm.autotvm.task.space import get_factors -from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform from ..util import traverse_inline, get_const_int, get_const_tuple -from ..nn import conv2d, conv2d_winograd_without_weight_transform, \ - get_pad_tuple, pad, conv2d_alter_layout +from .. import nn from ..nn.winograd_util import winograd_transform_matrices # reuse some compute declarations from ARM CPU -from ..arm_cpu.conv2d import _alter_conv2d_layout_arm from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw -@autotvm.register_topi_compute(conv2d, 'mali', ['direct']) -def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.mali") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for conv2d Parameters @@ -57,9 +55,6 @@ def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype dilation : list of two ints [dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -68,14 +63,11 @@ def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=3) - else: - raise ValueError("Unsupported layout {}".format(layout)) + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=3) -@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd']) -def schedule_conv2d_nchw_mali(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.mali") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): """TOPI schedule callback for conv2d Parameters @@ -113,9 +105,6 @@ def _callback(op): _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) - traverse_inline(s, outs[0].op, _callback) return s @@ -200,13 +189,27 @@ def _pick_tile_size(data, kernel): else: return 2 -@autotvm.register_topi_compute(conv2d, 'mali', ['winograd']) -def conv2d_mali_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): + +@autotvm.register_topi_compute("conv2d_nchw_winograd.mali") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): tile_size = _pick_tile_size(data, kernel) - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size) -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.mali") +def schedule_conv2d_nchw_winograd(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + _schedule_winograd(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -214,9 +217,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt dilation_h, dilation_w = dilation if len(kernel.shape) == 4: - if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -226,11 +228,10 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size @@ -420,34 +421,85 @@ def _schedule_winograd(cfg, s, op): s[Y].compute_at(s[output], tt) -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tile_size) +##### REGISTER ALTER OP LAYOUT ##### +@nn.conv2d_alter_layout.register(["mali"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - 'mali', ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): - """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} - def _callback(op): - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype - traverse_inline(s, outs[0].op, _callback) - return s + idxd = tvm.indexdiv + if topi_tmpl == "conv2d_nchw_spatial_pack.mali": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] -##### REGISTER ALTER OP LAYOUT ##### -@conv2d_alter_layout.register(["mali"]) -def _alter_conv2d_layout(attrs, inputs, tinfos, F): - try: - return _alter_conv2d_layout_arm(attrs, inputs, tinfos, F) - except KeyError: # to filter out fallback opencl templates + new_attrs['kernel_layout'] = 'OIHW%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nchw_spatial_pack.mali") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd.mali": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + tile_size = _pick_tile_size(data, kernel) + VC = cfg['tile_bna'].val + + weight_expr = inputs[1] + weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform( + weight_expr, tile_size=tile_size) + weight_expr = relay.reshape(weight_expr, + newshape=(KH + tile_size - 1, + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) + weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3]) + + new_attrs['tile_size'] = tile_size + + new_data = data + new_kernel = tvm.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + 'conv2d_nchw_winograd.mali') + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight_expr, **new_attrs) + else: return None diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py index 6096a99c97c2..3b233e92ba8a 100644 --- a/topi/python/topi/mali/dense.py +++ b/topi/python/topi/mali/dense.py @@ -22,12 +22,18 @@ import tvm from tvm import autotvm -from .. import generic, nn +from .. import nn from ..util import traverse_inline -autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault) -@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct') + +@autotvm.register_topi_compute('dense.mali') +def dense(_, data, weight, bias=None, out_dtype=None): + """Dense operator on Mali""" + return nn.dense(data, weight, bias, out_dtype) + + +@autotvm.register_topi_schedule('dense.mali') def schedule_dense(cfg, outs): """Schedule for dense operator. @@ -52,11 +58,11 @@ def _callback(op): vec_size = [1, 2, 4, 8, 16] max_unroll = 32 - dense = op.output(0) + dense_out = op.output(0) output = outs[0] y, x = s[output].op.axis - c = s[dense].op.reduce_axis[0] + c = s[dense_out].op.reduce_axis[0] ##### space definition begin ##### cfg.define_split('tile_y', y, num_outputs=3) @@ -66,12 +72,12 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'mali', 'rk3399', 'dense', 'direct') + 'mali', 'rk3399', 'dense.mali') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### - if dense.op in s.outputs: - dense = s.cache_write(output, 'local') + if dense_out.op in s.outputs: + dense_out = s.cache_write(output, 'local') by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) @@ -85,23 +91,25 @@ def _callback(op): s[output].unroll(yi) if cfg['tile_x'].size[-1] in vec_size: s[output].vectorize(xi) - s[dense].compute_at(s[output], tx) + s[dense_out].compute_at(s[output], tx) - k = s[dense].op.reduce_axis[0] - y, x = s[dense].op.axis - k, k_unroll = cfg['c_unroll'].apply(s, dense, k) - s[dense].reorder(k, k_unroll, y, x) - s[dense].unroll(k_unroll) + k = s[dense_out].op.reduce_axis[0] + y, x = s[dense_out].op.axis + k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k) + s[dense_out].reorder(k, k_unroll, y, x) + s[dense_out].unroll(k_unroll) if cfg['tile_y'].size[-1] < max_unroll: - s[dense].unroll(y) + s[dense_out].unroll(y) if cfg['tile_x'].size[-1] in vec_size: - s[dense].vectorize(x) + s[dense_out].vectorize(x) traverse_inline(s, outs[0].op, _callback) return s + def fuse_and_bind(s, tensor, axis=None, num_thread=None): """ fuse all the axis and bind to GPU threads """ + # TODO(@comaniac): figure out where this function is used. axis = axis or s[tensor].op.axis fused = s[tensor].fuse(*axis) bx, tx = s[tensor].split(fused, num_thread) diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py index 274b2944e4d9..4ff17e534feb 100644 --- a/topi/python/topi/mali/depthwise_conv2d.py +++ b/topi/python/topi/mali/depthwise_conv2d.py @@ -20,17 +20,18 @@ import tvm from tvm import autotvm -from ..generic import schedule_depthwise_conv2d_nchw -from ..nn import depthwise_conv2d_nchw +from .. import nn from ..util import traverse_inline # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'mali', 'direct', - depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.mali") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) + # register customized schedule for arm cpu. -@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'mali', 'direct') -def schedule_depthwise_conv2d_nchw_mali(cfg, outs): +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.mali") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule depthwise conv2d Parameters @@ -64,7 +65,7 @@ def _schedule(pad_data, kernel, conv): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'mali', 'rk3399', 'depthwise_conv2d_nchw', 'direct') + 'mali', 'rk3399', 'depthwise_conv2d_nchw.mali') cfg.fallback_with_reference_log(ref_log) ###### space definition end ###### diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py index 7b872ceacf29..d69562c4daf6 100644 --- a/topi/python/topi/nn/batch_matmul.py +++ b/topi/python/topi/nn/batch_matmul.py @@ -20,7 +20,7 @@ import tvm from ..util import get_const_tuple -def batch_matmul_default(x, y): +def batch_matmul(x, y): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -48,23 +48,3 @@ def batch_matmul_default(x, y): return tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), tag='batch_matmul') - -@tvm.target.generic_func -def batch_matmul(x, y): - """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are - data in batch. - - Parameters - ---------- - x : tvm.Tensor - 3-D with shape [batch, M, K] - - y : tvm.Tensor - 3-D with shape [batch, N, K] - - Returns - ------- - output : tvm.Tensor - 3-D with shape [batch, M, N] - """ - return batch_matmul_default(x, y) diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index e1f8f819968f..f18a5aae7eed 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -19,13 +19,11 @@ """Bitserial Conv2D operators""" from __future__ import absolute_import as _abs import tvm -from tvm import autotvm from .pad import pad from .util import get_pad_tuple -from .bitserial_util import bitpack, binary_op_multiplier +from .bitserial_util import bitpack from ..util import get_const_tuple -@tvm.target.generic_func def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """Bitserial Conv2D operator. @@ -117,7 +115,6 @@ def _conv(nn, ff, yy, xx): return tvm.compute((batch, out_channel, out_height, out_width), _conv, name="Conv2dOutput", tag="bitserial_conv2d_nchw") -@tvm.target.generic_func def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """Bitserial Conv2D operator. @@ -213,222 +210,6 @@ def _conv(nn, yy, xx, ff): return conv -@autotvm.register_topi_compute(bitserial_conv2d_nchw, ['cpu', 'arm_cpu'], 'direct') -def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, - pack_dtype='uint32', out_dtype='int16', unipolar=True): - """ Compute convolution with pack on spatial axes. """ - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) - # Check if kernel is already bitpacked - if len(kernel.shape) == 4: - kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) - KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) - else: - kernel_vec = kernel - OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape) - CO = OCO * VC - - IB, N, CI, H, W = get_const_tuple(data_q.shape) - KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) - - if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): - TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) - else: - TPAD, LPAD, DPAD, RPAD = padding - pad_before = [0, 0, 0, TPAD, LPAD] - pad_after = [0, 0, 0, DPAD, RPAD] - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - TH = H + TPAD + DPAD - TW = W + LPAD + RPAD - OH = (H + TPAD + DPAD - KH) // HSTR + 1 - OW = (W + LPAD + RPAD - KW) // WSTR + 1 - - # ==================== define configuration space ==================== - n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) - ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) - ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) - - co, vc = cfg.define_split('tile_co', co, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - - cfg.define_reorder("reorder_0", - [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], - policy='interval_all', interval=(6, 11)) - # binary ops - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) - # ==================== - - VC = cfg["tile_co"].size[-1] - VH = cfg["tile_oh"].size[-1] - VW = cfg["tile_ow"].size[-1] - - dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) - kvshape = (CO//VC, CI, KH, KW, KB, VC) - ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) - oshape = (1, CO, OH, OW) - - if (TPAD != 0 and RPAD != 0): - data_pad = pad(data_q, pad_before, pad_after, name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ - data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') - - if len(kernel.shape) == 4: - kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ - kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, co, h, w, vh, vw, vc): - b1b2 = (b1+b2).astype(out_dtype) - if unipolar: - return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & - kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) - & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, - axis=[ci, dh, dw, b1, b2]) - - return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) - - conv = tvm.compute(ovshape, _conv, name='conv_out') - idxd = tvm.indexdiv - idxm = tvm.indexmod - - return tvm.compute( - oshape, lambda n, co, h, w: - conv[n, - idxd(co, VC), idxd(h, VH), idxd(w, VW), - idxm(h, VH), idxm(w, VW), idxm(co, VC)], - name='conv_vec', tag='spatial_bitserial_conv_nchw') - -@autotvm.register_topi_compute(bitserial_conv2d_nhwc, 'cpu', 'direct') -def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, - pack_dtype='uint32', out_dtype='int16', unipolar=True): - """ Compute convolution with pack on spatial axes. """ - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) - pack_kernel = len(kernel.shape) == 4 - - if pack_kernel: - kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) - else: - kernel_q = kernel - - KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape) - N, H, W, CI, IB = get_const_tuple(data_q.shape) - - if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): - TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) - else: - TPAD, LPAD, DPAD, RPAD = padding - pad_before = [0, TPAD, LPAD, 0, 0] - pad_after = [0, DPAD, RPAD, 0, 0] - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - PAD_H = H + (TPAD + DPAD) - PAD_W = W + (LPAD + RPAD) - OH = (PAD_H - KH) // HSTR + 1 - OW = (PAD_W - KW) // WSTR + 1 - oshape = (1, OH, OW, CO) - - # ==================== define configuration space ==================== - n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) - ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) - ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) - - co, vc = cfg.define_split('tile_co', co, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - cfg.define_reorder("reorder_0", - [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], - policy='interval_all', interval=(3, 7)) - # binary ops - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) - # ==================== - - VC = cfg["tile_co"].size[-1] - VH = cfg["tile_oh"].size[-1] - VW = cfg["tile_ow"].size[-1] - - dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) - kvshape = (CO, KH, KW, CI, VC, KB) - ovshape = (1, OH, OW, CO, VH, VW, VC) - oshape = (1, OH, OW, CO) - - if (DPAD != 0 and RPAD != 0): - data_pad = pad(data_q, pad_before, pad_after, name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') - - kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ - kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, h, w, co, vh, vw, vc): - b1b2 = (b1+b2).astype(out_dtype) - if unipolar: - return tvm.sum( - ((tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& - ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), - axis=[dh, dw, ci, b1, b2]) - - return tvm.sum(tvm.popcount( - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, - axis=[dh, dw, ci, b1, b2]) - - conv = tvm.compute(ovshape, _conv, name='conv') - - idxd = tvm.indexdiv - idxm = tvm.indexmod - return tvm.compute( - oshape, lambda n, h, w, co: - conv[n, - idxd(h, VH), idxd(w, VW), idxd(co, VC), - idxm(h, VH), idxm(w, VW), idxm(co, VC)], - name='output_unpack', tag='spatial_bitserial_conv_nhwc') - @tvm.target.generic_func def bitserial_conv2d_legalize(attrs, inputs, types): """Legalizes Bitserial Conv2D op. diff --git a/topi/python/topi/nn/bitserial_dense.py b/topi/python/topi/nn/bitserial_dense.py index d77a1b7b0fc2..fa1b5df7d066 100644 --- a/topi/python/topi/nn/bitserial_dense.py +++ b/topi/python/topi/nn/bitserial_dense.py @@ -18,11 +18,9 @@ """Bitserial Dense operator.""" from __future__ import absolute_import import tvm -from tvm import autotvm from topi.util import get_const_tuple -from .bitserial_util import bitpack, binary_op_multiplier +from .bitserial_util import bitpack -@tvm.target.generic_func def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """The default implementation of bitserial dense in topi. @@ -66,78 +64,3 @@ def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', if unipolar: return matmul_unipolar return matmul - - -@autotvm.register_topi_compute(bitserial_dense, ['cpu'], 'direct') -def bitserial_dense_default(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint32', - out_dtype='int16', unipolar=True): - """Bitserial dense implementation. TODO: Why are these separate - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] or - 3-D with shape [out_dim, weight_bits, in_dim] - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) - if len(weight.shape) == 2: - weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) - else: - weight_packed = weight - Y, DB, K = get_const_tuple(data_packed.shape) - X, WB, _ = get_const_tuple(weight_packed.shape) - ######## Search space - x, y = cfg.axis(X), cfg.axis(Y) - db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K) - ko, ki = cfg.define_split('tile_k', k, num_outputs=2) - yo, yi = cfg.define_split('tile_y', y, num_outputs=2) - xo, xi = cfg.define_split('tile_x', x, num_outputs=2) - - cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi], - policy='candidate', candidate=[ - [yo, xo, ko, yi, wb, db, ki, xi], - [yo, xo, yi, ko, wb, db, ki, xi]]) - - cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll') - cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec') - - ###### Compute rule - VX = cfg['tile_x'].size[-1] - - wvshape = (X//VX, WB, VX, K) - oshape = (Y, X) - - k = tvm.reduce_axis((0, K), name='k') - db = tvm.reduce_axis((0, DB), name='db') - wb = tvm.reduce_axis((0, WB), name='wb') - - # Tile data and weights - weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k: - weight_packed[xo*VX+vx][wb][k], name='weight_vec') - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - - matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( - (tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - - tvm.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - ).astype(out_dtype) - << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') - - matmul = tvm.compute(oshape, lambda i, j: tvm.sum( - tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] - ).astype(out_dtype) - << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') - - # binary ops - cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype)) - - if unipolar: - return matmul_unipolar - return matmul diff --git a/topi/python/topi/nn/conv1d.py b/topi/python/topi/nn/conv1d.py index 98fa2e3d7001..4565fd2f5a46 100644 --- a/topi/python/topi/nn/conv1d.py +++ b/topi/python/topi/nn/conv1d.py @@ -23,7 +23,6 @@ from .util import get_pad_tuple1d -@tvm.target.generic_func def conv1d(data, kernel, strides=1, @@ -101,6 +100,13 @@ def conv1d_ncw(data, out_dtype : str The output data type. If None then output is same type as input. """ + if out_dtype is None: + out_dtype = data.dtype + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + batch, in_channels, data_width = data.shape out_channels, _, kernel_size = kernel.shape @@ -158,6 +164,13 @@ def conv1d_nwc(data, out_dtype : str The output data type. If None then output is same type as input. """ + if out_dtype is None: + out_dtype = data.dtype + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + batch, data_width, in_channels = data.shape kernel_size, _, out_channels = kernel.shape diff --git a/topi/python/topi/nn/conv1d_transpose.py b/topi/python/topi/nn/conv1d_transpose.py index 39918e90c317..8d224247db01 100644 --- a/topi/python/topi/nn/conv1d_transpose.py +++ b/topi/python/topi/nn/conv1d_transpose.py @@ -24,7 +24,6 @@ from .util import get_pad_tuple1d -@tvm.target.generic_func def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype): """Transposed 1D convolution ncw forward operator. diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 52f4b12a1d2d..0d73c8b0b866 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -23,7 +23,7 @@ from .pad import pad from .util import get_pad_tuple -from ..util import simplify, get_const_tuple, get_const_int +from ..util import simplify, get_const_tuple, get_const_int, tag from .winograd_util import winograd_transform_matrices # workload description of conv2d @@ -31,7 +31,6 @@ ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'groups', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -@tvm.target.generic_func def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=None): """Conv2D operator. @@ -96,7 +95,7 @@ def conv2d_legalize(attrs, inputs, types): @tvm.target.generic_func -def conv2d_alter_layout(attrs, inputs, tinfos, F): +def conv2d_alter_layout(attrs, inputs, tinfos, out_type): """Change Conv2D layout. Parameters @@ -107,13 +106,12 @@ def conv2d_alter_layout(attrs, inputs, tinfos, F): Grouped input symbols tinfos : list Input shape and dtype - F: symbol - The context, can be either relay.op + out_type: type + The output type Note ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. + Unlike other TOPI functions, this function operates on both graph level and operator level. """ # not to change by default return None @@ -368,7 +366,6 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): return Output -@tvm.target.generic_func def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'): """Conv2D operator for nChw[x]c layout. @@ -408,58 +405,9 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - return conv2d_NCHWc_compute(data, - kernel, - stride, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -def conv2d_NCHWc_compute(data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): - """Conv2D operator compute for nChw[x]c layout. - - Parameters - ---------- - data : tvm.Tensor - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.Tensor - 6-D with shape - [num_filter_chunk, in_channel_chunk, filter_height, filter_width, - in_channel_block, num_filter_block] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - layout : str - Input data layout - - out_layout : str - Output data layout - - out_dtype : str - output data type - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] - """ - # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = dilation if isinstance(dilation, (tuple, list)) \ else (dilation, dilation) @@ -516,8 +464,7 @@ def conv2d_NCHWc_compute(data, kernel, strides, padding, dilation, layout, out_l name='conv2d_NCHWc', tag="conv2d_NCHWc") -@tvm.target.generic_func -def conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, layout, out_layout, +def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='int32'): """Conv2D operator for nChw[x]c layout. @@ -557,59 +504,9 @@ def conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, layout, out_layo 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - return conv2d_NCHWc_int8_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -def conv2d_NCHWc_int8_compute(data, kernel, strides, padding, dilation, layout, out_layout, - out_dtype='int32'): - """Conv2D operator for nChw[x]c layout. - - Parameters - ---------- - data : tvm.Tensor - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.Tensor - 7-D with shape - [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4, - num_filter_block, 4] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - layout : str - Input data layout - - out_layout : str - Output data layout - - out_dtype : str - output data type - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] - """ - # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = dilation if isinstance(dilation, (tuple, list)) \ else (dilation, dilation) @@ -724,33 +621,6 @@ def conv2d_winograd_weight_transform(kernel, tile_size): axis=[r_kh, r_kw]), name='transform_weight') -@tvm.target.generic_func -def conv2d_winograd_without_weight_transform(input, filter, strides, padding, dilation, - layout, out_dtype, tile_size): - """Compute convolution in winograd algorithm. The filter is supposed to be transformed - in advance. - - Parameters - ---------- - input : tvm.Tensor - 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor - 4-D with shape [filter_height, filter_width, in_channel, num_filter] - strides : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - padding : int or str - Padding size, or ['VALID', 'SAME'] - tile_size: int - Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_height, out_width, out_channel] - """ - raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - - def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype): """Weight transformation for winograd Parameters @@ -769,32 +639,7 @@ def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_d return nnpack.convolution_inference_weight_transform( kernel, algorithm=convolution_algorithm, dtype=out_dtype) -@tvm.target.generic_func -def conv2d_winograd_nnpack_without_weight_transform( - input, filter, bias, strides, padding, dilation, layout, out_dtype): - """Compute convolution in winograd algorithm. The filter is supposed to be transformed - in advance. - Parameters - ---------- - input : tvm.Tensor - 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor - 4-D with shape [num_filter, in_channel, 8, 8] - bias : tvm.Tensor - 1-D with shape [num_filter] - strides : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - padding : int or str - Padding size, or ['VALID', 'SAME'] - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_height, out_width, out_channel] - """ - raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - -@tvm.target.generic_func def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None): """Group convolution operator in NCHW layout. @@ -871,3 +716,20 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag='group_conv2d_nchw') + + +def unpack_NCHWc_to_nchw(packed_out, out_dtype): + n, oc_chunk, oh, ow, oc_bn = get_const_tuple(packed_out.shape) + + idxmod = tvm.indexmod + idxdiv = tvm.indexdiv + + oshape = (n, oc_chunk * oc_bn, oh, ow) + unpacked_out = \ + tvm.compute(oshape, + lambda n, c, h, w: + packed_out[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)] + .astype(out_dtype), + name='output_unpack', + tag=tag.INJECTIVE+",unpack_nchwc") + return unpacked_out \ No newline at end of file diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py index e635f43cdbc4..db132fc81f13 100644 --- a/topi/python/topi/nn/conv2d_transpose.py +++ b/topi/python/topi/nn/conv2d_transpose.py @@ -25,7 +25,6 @@ from ..util import simplify -@tvm.target.generic_func def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index 83c16dae7ac4..a37d9894d4c3 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -25,46 +25,8 @@ from ..util import simplify -@tvm.target.generic_func -def conv3d(input, filter, strides, padding, dilation, layout='NCDHW', out_dtype=None): - """Conv3D operator. - - Parameters - ---------- - input : tvm.Tensor - 5-D with shape [batch, in_depth, in_channel, in_height, in_width] - - filter : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] - - strides : int or a list/tuple of three ints - stride size, or [stride_depth, stride_height, stride_width] - - padding : int or a list/tuple of three ints - padding size, or [pad_depth, pad_height, pad_width] - - dilation: int or a list/tuple of three ints - dilation size, or [dilation_depth, dilation_height, dilation_width] - - layout : str - layout of data - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_depth, out_channel, out_height, out_width] - """ - # search platform specific declaration first - # default declaration - if layout == 'NCDHW': - return conv3d_ncdhw(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'NDHWC': - return conv3d_ndhwc(input, filter, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - -def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCDHW layout. +def conv3d_ncdhw(Input, Filter, stride, padding, dilation, layout='NCDHW', out_dtype=None): + """Conv3D operator in NCDHW layout. Parameters ---------- @@ -88,6 +50,7 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): Output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ + assert layout == "NCDHW" if out_dtype is None: out_dtype = Input.dtype assert isinstance(stride, int) or len(stride) == 3 @@ -132,7 +95,7 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): axis=[rc, rz, ry, rx]), tag="conv3d_ncdhw") -def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): +def conv3d_ndhwc(Input, Filter, stride, padding, dilation, layout='NDHWC', out_dtype='float32'): """Convolution operator in NDHWC layout. Parameters @@ -157,6 +120,7 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ + assert layout == "NDHWC" assert isinstance(stride, int) or len(stride) == 3 assert isinstance(dilation, int) or len(dilation) == 3 diff --git a/topi/python/topi/nn/deformable_conv2d.py b/topi/python/topi/nn/deformable_conv2d.py index 2417411efc37..251f68aa8c25 100644 --- a/topi/python/topi/nn/deformable_conv2d.py +++ b/topi/python/topi/nn/deformable_conv2d.py @@ -22,7 +22,6 @@ from ..util import get_const_tuple from ..cpp.util import bilinear_sample_nchw -@tvm.target.generic_func def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype): """Deformable conv2D operator in NCHW layout. diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index 671b602edc30..fe21e7417bda 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -19,7 +19,7 @@ import tvm from .. import tag -def dense_default(data, weight, bias=None, out_dtype=None): +def dense(data, weight, bias=None, out_dtype=None): """The default implementation of dense in topi. Parameters @@ -59,29 +59,3 @@ def dense_default(data, weight, bias=None, out_dtype=None): lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ tag=tag.BROADCAST) return matmul - - -@tvm.target.override_native_generic_func("dense") -def dense(data, weight, bias=None, out_dtype=None): - """Applies a linear transformation: :math:`Y = XW^T + b`. - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] - - bias : tvm.Tensor, optional - 1-D with shape [out_dim] - - out_dtype : str - The output type. This is used for mixed precision. - - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - return dense_default(data, weight, bias, out_dtype) diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py index f50e357a3bb8..49aaace0f833 100644 --- a/topi/python/topi/nn/depthwise_conv2d.py +++ b/topi/python/topi/nn/depthwise_conv2d.py @@ -47,7 +47,6 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, HPAD, WPAD, HSTR, WSTR) -@tvm.target.generic_func def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nchw forward operator. @@ -121,7 +120,6 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No return Output -@tvm.target.generic_func def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nhwc forward operator. @@ -307,7 +305,6 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid return Weight_grad -@tvm.target.generic_func def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation, layout, out_layout, out_dtype=None): """Depthwise convolution NCHW[x]c forward operator. diff --git a/topi/python/topi/nn/local_response_norm.py b/topi/python/topi/nn/local_response_norm.py index de002bfffbe6..1b41c7dbfb5e 100644 --- a/topi/python/topi/nn/local_response_norm.py +++ b/topi/python/topi/nn/local_response_norm.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name """TVM operator for local response norm compute.""" from __future__ import absolute_import -import tvm from .. import cpp -@tvm.target.generic_func def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2): """Perform the across channels local response normalisation on the input data. diff --git a/topi/python/topi/nn/sparse.py b/topi/python/topi/nn/sparse.py index 584126ea2015..6974ff4a13ab 100644 --- a/topi/python/topi/nn/sparse.py +++ b/topi/python/topi/nn/sparse.py @@ -22,7 +22,6 @@ from ..util import get_const_tuple -@tvm.target.generic_func def sparse_dense(data, weight_data, weight_indices, weight_indptr): """ Computes sparse-dense matrix multiplication of `data` and @@ -105,7 +104,7 @@ def _compute_block(i, nb_j, j): lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)], tag="sparse_dense_bsrmm") -@tvm.target.generic_func + def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): """ Transpose a square sparse matrix, @@ -148,14 +147,15 @@ def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): shape=output_shape, inputs=[sparse_data, sparse_indices, sparse_indptr], fcompute=lambda ins, outs: - csr_transpose_ir(ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]), + _csr_transpose_ir(ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]), tag="sparse_transpose_csr", dtype=['float32', 'int32', 'int32'], name='out') return [output_data, output_indices, output_indptr] -def csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): + +def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): """define ir for csr_transpose""" irb = tvm.ir_builder.create() diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index aa73e849427b..f0cdd9a0d3c2 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -143,7 +143,7 @@ def get_pad_tuple(padding, kernel): pad_h = padding[0] * 2 pad_w = padding[1] * 2 elif len(padding) == 4: - return padding[0], padding[1], padding[2], padding[3] + return padding[0], padding[1], padding[2], padding[3] else: raise ValueError("Size of padding can only be 2 or 4") elif isinstance(padding, int): diff --git a/topi/python/topi/opengl/conv2d_nchw.py b/topi/python/topi/opengl/conv2d_nchw.py index e39d1ad805b0..52ed11972e6f 100644 --- a/topi/python/topi/opengl/conv2d_nchw.py +++ b/topi/python/topi/opengl/conv2d_nchw.py @@ -18,9 +18,7 @@ """Schedule for conv2d_nchw with auto fusion""" import tvm from .. import tag -from .. import generic -@generic.schedule_conv2d_nchw.register(["opengl"]) def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw. diff --git a/topi/python/topi/opengl/dense.py b/topi/python/topi/opengl/dense.py index c93dfccbeece..db2c4a677904 100644 --- a/topi/python/topi/opengl/dense.py +++ b/topi/python/topi/opengl/dense.py @@ -19,9 +19,7 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic -@generic.schedule_dense.register(["opengl"]) def schedule_dense(outs): """Schedule for dense operator. diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py index d3ebc943b962..28dc87d1a5fb 100644 --- a/topi/python/topi/opengl/injective.py +++ b/topi/python/topi/opengl/injective.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic -@generic.schedule_injective_from_existing.register(["opengl"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -38,7 +36,6 @@ def schedule_injective_from_existing(sch, out): sch[out].opengl() return sch -@generic.schedule_injective.register(["opengl"]) def schedule_injective(outs): """Schedule for injective op. diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 04c7b0cd0002..3226422048e5 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -18,9 +18,7 @@ """Schedule for pooling operators""" import tvm from .. import tag -from .. import generic -@generic.schedule_adaptive_pool.register(["opengl"]) def schedule_adaptive_pool(outs): """Schedule for adaptive pool. @@ -69,7 +67,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["opengl"]) def schedule_pool(outs, layout): """Schedule for pool. diff --git a/topi/python/topi/opengl/softmax.py b/topi/python/topi/opengl/softmax.py index e343d4513241..ff218d13c2b1 100644 --- a/topi/python/topi/opengl/softmax.py +++ b/topi/python/topi/opengl/softmax.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" import tvm -from .. import generic -@generic.schedule_softmax.register(["opengl"]) def schedule_softmax(outs): """Schedule for softmax op. diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index be29c6f6b0cc..0daa4be58e6c 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -20,13 +20,12 @@ from tvm import autotvm from tvm.contrib import miopen -from .. import nn, generic +from .. import generic from ..util import get_const_tuple -from ..cuda.conv2d import conv2d_cuda, schedule_conv2d_nchw_cuda from ..nn.util import get_pad_tuple -@autotvm.register_topi_compute(nn.conv2d, 'rocm', ['direct', 'winograd']) -def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): +@autotvm.register_topi_compute("conv2d_nchw_miopen.rocm") +def conv2d_nchw_miopen(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): """Conv2D operator for rocm backend. Parameters @@ -57,39 +56,34 @@ def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou 4-D with shape [batch, out_channel, out_height, out_width] """ - target = tvm.target.Target.current() - if "miopen" in target.libs: - assert layout == 'NCHW', "Only NCHW layout is supported." - CO, CI, KH, KW = get_const_tuple(kernel.shape) - N, _, H, W = get_const_tuple(data.shape) - - # handle dilation - stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - pad_h, pad_w = pt + pb, pl + pr - dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation - - OH = (H + 2 * pad_h - KH) // stride_h + 1 - OW = (W + 2 * pad_w - KW) // stride_w + 1 - cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ - ((KW - 1) * dilation_w + 1)) - - return miopen.conv2d_forward(data, - kernel, - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - conv_mode=0, - data_type=1) - - return conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'rocm', ["direct", 'winograd']) -def schedule_conv2d_nchw_rocm(cfg, outs): + CO, CI, KH, KW = get_const_tuple(kernel.shape) + N, _, H, W = get_const_tuple(data.shape) + + # handle dilation + stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides + pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pad_h, pad_w = pt + pb, pl + pr + dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation + + OH = (H + 2 * pad_h - KH) // stride_h + 1 + OW = (W + 2 * pad_w - KW) // stride_w + 1 + cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ + ((KW - 1) * dilation_w + 1)) + + return miopen.conv2d_forward(data, + kernel, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + conv_mode=0, + data_type=1) + + +@autotvm.register_topi_schedule("conv2d_nchw_miopen.rocm") +def schedule_conv2d_nchw_miopen(cfg, outs): """TOPI schedule callback of conv2d for rocm Parameters @@ -106,8 +100,4 @@ def schedule_conv2d_nchw_rocm(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if target and "miopen" in target.libs: - return generic.schedule_extern(outs) - - return schedule_conv2d_nchw_cuda(cfg, outs) + return generic.schedule_extern(outs) diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index f2adeaabef61..8729a62bd677 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -20,13 +20,12 @@ import tvm from tvm import autotvm from tvm.contrib import rocblas -import topi -from ..nn.dense import dense, dense_default +from .. import generic, nn from .. import tag -from .. import generic +from ..util import traverse_inline -@autotvm.register_topi_compute(dense, "rocm", "direct") -def dense_rocm(cfg, data, weight, bias=None, out_dtype=None): +@autotvm.register_topi_compute('dense.rocm') +def dense(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for rocm backend. Parameters @@ -54,21 +53,10 @@ def dense_rocm(cfg, data, weight, bias=None, out_dtype=None): assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - batch, in_dim = data.shape - out_dim, _ = weight.shape - target = tvm.target.Target.current() - if "rocblas" in target.libs: - assert out_dtype == data.dtype, "Mixed precision not supported." - matmul = rocblas.matmul(data, weight, False, True) - if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j], \ - tag=tag.BROADCAST) - return matmul - return dense_default(data, weight, bias, out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_dense, "rocm", "direct") + return nn.dense(data, weight, bias, out_dtype) + + +@autotvm.register_topi_schedule('dense.rocm') def schedule_dense(cfg, outs): """Schedule for dense operator. @@ -83,7 +71,72 @@ def schedule_dense(cfg, outs): s: Schedule The computation schedule for dense. """ - target = tvm.target.Target.current() - if target.target_name == "rocm" and "rocblas" in target.libs: - return generic.schedule_extern(outs) - return topi.cuda.schedule_dense(cfg, outs) + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'dense': + Dense = op.output(0) + num_thread = 64 + k = Dense.op.reduce_axis[0] + ko, kf = s[Dense].split(k, factor=num_thread) + DenseF = s.rfactor(Dense, kf) + + if Dense.op in s.outputs: + Out = Dense + else: + Out = outs[0].op.output(0) + s[Dense].compute_at(s[Out], s[Out].op.axis[1]) + s[Out].bind(s[Out].op.axis[0], tvm.thread_axis("blockIdx.y")) + s[Out].bind(s[Out].op.axis[1], tvm.thread_axis("blockIdx.x")) + + tx = s[Dense].op.reduce_axis[0] + thread_x = tvm.thread_axis("threadIdx.x") + s[Dense].bind(tx, thread_x) + s[DenseF].compute_at(s[Dense], tx) + s[Dense].set_store_predicate(thread_x.var.equal(0)) + s[Out].set_store_predicate(thread_x.var.equal(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_compute('dense_rocblas.rocm') +def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator for rocm backend with cblas. + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] + + bias : tvm.Tensor, optional + 1-D with shape [out_dim] + + out_dtype : str + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + assert out_dtype == data.dtype, "Mixed precision not supported." + matmul = rocblas.matmul(data, weight, False, True) + batch, in_dim = data.shape + out_dim, _ = weight.shape + cfg.add_flop(batch * in_dim * out_dim * 2) + if bias is not None: + matmul = tvm.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j], + tag=tag.BROADCAST) + return matmul + + +@autotvm.register_topi_schedule('dense_rocblas.rocm') +def schedule_dense_rocblas(_, outs): + """Schedule for dense operator with rocm cblas""" + return generic.schedule_extern(outs) diff --git a/topi/python/topi/rocm/nn.py b/topi/python/topi/rocm/nn.py index 8a9c8c393da6..5f134cb32c98 100644 --- a/topi/python/topi/rocm/nn.py +++ b/topi/python/topi/rocm/nn.py @@ -17,12 +17,7 @@ """scheduler for normalization functions on rocm backend""" from __future__ import absolute_import as _abs -import tvm -from .. import generic from .. import cpp -@generic.schedule_lrn.register(["rocm", "gpu"]) def schedule_lrn(outs): - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.rocm.schedule_lrn(cpp_target, outs) + return cpp.rocm.schedule_lrn(outs) diff --git a/topi/python/topi/sort.py b/topi/python/topi/sort.py index 22899c4232f7..96a088923d2d 100644 --- a/topi/python/topi/sort.py +++ b/topi/python/topi/sort.py @@ -20,7 +20,6 @@ from tvm import api from .util import get_const_tuple -@tvm.target.generic_func def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): """Performs sorting along the given axis and returns an array of indices having the same shape as an input array that index @@ -99,7 +98,6 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): return out -@tvm.target.generic_func def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): """Get the top k elements in an input tensor along the given axis. diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 5bb36f7dfa74..c171f8ca5fe3 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -116,7 +116,7 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): out_tensor[i, j, k] = -one return valid_count, out_tensor -@tvm.target.generic_func + def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -289,7 +289,6 @@ def hybrid_nms(data, sorted_index, valid_count, return output, box_indices -@tvm.target.generic_func def non_max_suppression(data, valid_count, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, diff --git a/topi/python/topi/vision/rcnn/proposal.py b/topi/python/topi/vision/rcnn/proposal.py index d48c89078ec0..5de4998c066c 100644 --- a/topi/python/topi/vision/rcnn/proposal.py +++ b/topi/python/topi/vision/rcnn/proposal.py @@ -317,7 +317,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): body = ib.get() return body -@tvm.target.generic_func + def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): """Proposal operator. diff --git a/topi/python/topi/vision/rcnn/roi_align.py b/topi/python/topi/vision/rcnn/roi_align.py index a6540b3666a5..a0bc5e291597 100644 --- a/topi/python/topi/vision/rcnn/roi_align.py +++ b/topi/python/topi/vision/rcnn/roi_align.py @@ -21,7 +21,6 @@ from ...cpp.util import bilinear_sample_nchw -@tvm.target.generic_func def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): """ROI align operator in NCHW layout. diff --git a/topi/python/topi/vision/rcnn/roi_pool.py b/topi/python/topi/vision/rcnn/roi_pool.py index 53ffe35e7e1b..f346f580b3ba 100644 --- a/topi/python/topi/vision/rcnn/roi_pool.py +++ b/topi/python/topi/vision/rcnn/roi_pool.py @@ -19,7 +19,6 @@ import tvm from ...util import get_const_tuple -@tvm.target.generic_func def roi_pool_nchw(data, rois, pooled_size, spatial_scale): """ROI pool operator in NCHW layout. diff --git a/topi/python/topi/vision/reorg.py b/topi/python/topi/vision/reorg.py index 7adfc73d9be1..3ba5e8495a22 100644 --- a/topi/python/topi/vision/reorg.py +++ b/topi/python/topi/vision/reorg.py @@ -20,10 +20,8 @@ Reorg operator, used in darknet. """ from __future__ import absolute_import as _abs -import tvm from .. import cpp -@tvm.target.generic_func def reorg(data, stride): """Reorg forward operators. diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 8c31f823cbe4..4309af4303f1 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -89,7 +89,6 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): return output -@tvm.target.generic_func def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -233,7 +232,6 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, return out_loc, valid_count -@tvm.target.generic_func def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection @@ -267,7 +265,6 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 tvm.const(threshold, "float32"), tvm.convert(variances)) -@tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index d1c728d7b75c..ce07c194268a 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -19,9 +19,9 @@ """x86 specific declaration and schedules.""" from __future__ import absolute_import as _abs -from .conv1d import schedule_conv1d_nwc -from .conv2d import schedule_conv2d, schedule_conv2d_nhwc -from .conv3d import schedule_conv3d_ndhwc +from .conv1d import * +from .conv2d import * +from .conv3d import * from .binarize_pack import schedule_binarize_pack from .binary_dense import schedule_binary_dense from .nn import * @@ -29,12 +29,12 @@ from .injective import * from .reduction import * from .pooling import schedule_pool, schedule_adaptive_pool -from .bitserial_conv2d import schedule_bitserial_conv2d -from .bitserial_dense import schedule_bitserial_dense -from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc -from .dense import _schedule_dense, _schedule_dense_pack, _schedule_dense_nopack -from .batch_matmul import schedule_batch_matmul +from .bitserial_conv2d import * +from .bitserial_dense import * +from .depthwise_conv2d import * +from .dense import * +from .batch_matmul import * from .roi_align import roi_align_nchw -from .conv2d_transpose import _schedule_conv2d_transpose_nchw +from .conv2d_transpose import * from .sparse import * from .conv2d_alter_op import * diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py index fef6c48d6bed..a7cb9e98f11f 100644 --- a/topi/python/topi/x86/batch_matmul.py +++ b/topi/python/topi/x86/batch_matmul.py @@ -21,12 +21,12 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cblas -from .. import generic, nn +from .. import generic from ..util import traverse_inline, get_const_tuple, get_max_power2_factor -@autotvm.register_topi_compute(nn.batch_matmul, "cpu", "direct") -def _declaration_batch_matmul_nopack(cfg, x, y): +@autotvm.register_topi_compute("batch_matmul.x86") +def batch_matmul(cfg, x, y): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -43,10 +43,6 @@ def _declaration_batch_matmul_nopack(cfg, x, y): output : tvm.Tensor 3-D with shape [batch, M, N] """ - target = tvm.target.Target.current() - if "cblas" in target.libs: - return cblas.batch_matmul(x, y, False, True) - assert len(x.shape) == 3 and len( y.shape) == 3, "only support 3-dim batch_matmul" XB, M, XK = get_const_tuple(x.shape) @@ -56,7 +52,7 @@ def _declaration_batch_matmul_nopack(cfg, x, y): B = XB K = XK if cfg.is_fallback: - _default_batch_matmul_nopack_config(cfg, M, N, K) + _default_batch_matmul_config(cfg, M, N, K) k = tvm.reduce_axis((0, K), name='k') C = tvm.compute( @@ -66,7 +62,7 @@ def _declaration_batch_matmul_nopack(cfg, x, y): return C -@autotvm.register_topi_schedule(generic.schedule_batch_matmul, "cpu", "direct") +@autotvm.register_topi_schedule("batch_matmul.x86") def schedule_batch_matmul(cfg, outs): """Schedule for batch_matmul @@ -83,10 +79,6 @@ def schedule_batch_matmul(cfg, outs): sch: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -131,9 +123,42 @@ def _callback(op): return s -def _default_batch_matmul_nopack_config(cfg, M, N, K): +def _default_batch_matmul_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // 16, 16]) x_bn = get_max_power2_factor(N, 8) cfg["tile_x"] = SplitEntity([N // x_bn, x_bn]) y_bn = get_max_power2_factor(M, 8) cfg["tile_y"] = SplitEntity([M // y_bn, y_bn]) + + +@autotvm.register_topi_compute("batch_matmul_cblas.x86") +def batch_matmul_cblas(cfg, x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + cfg : ConfigSpace + Autotvm tuning space config file + x : tvm.Tensor + 3-D with shape [batch, M, K] + y : tvm.Tensor + 3-D with shape [batch, N, K] + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + assert len(x.shape) == 3 and len( + y.shape) == 3, "only support 3-dim batch_matmul" + XB, M, XK = get_const_tuple(x.shape) + YB, N, YK = get_const_tuple(y.shape) + assert XB == YB, "batch dimension doesn't match" + assert XK == YK, "shapes of x and y is inconsistant" + cfg.add_flop(XB * M * N * XK * 2) + return cblas.batch_matmul(x, y, False, True) + + +@autotvm.register_topi_schedule("batch_matmul_cblas.x86") +def schedule_batch_matmul_cblas(_, outs): + return generic.schedule_extern(outs) diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 97d0dc0eefaa..2ec565375654 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -18,12 +18,237 @@ """Bitserial conv2d schedule on x86""" import tvm from tvm import autotvm -from topi.util import get_const_int -from .. import generic, tag +from .. import tag +from ..util import get_const_int, get_const_tuple +from ..nn.pad import pad +from ..nn.util import get_pad_tuple +from ..nn.bitserial_util import bitpack, binary_op_multiplier + +@autotvm.register_topi_compute("bitserial_conv2d_nchw.x86") +def bitserial_conv2d_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype='uint32', out_dtype='int16', unipolar=True): + """ Compute convolution with pack on spatial axes. """ + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + # Check if kernel is already bitpacked + if len(kernel.shape) == 4: + kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) + else: + kernel_vec = kernel + OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape) + CO = OCO * VC + + IB, N, CI, H, W = get_const_tuple(data_q.shape) + KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) + + if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): + TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) + else: + TPAD, LPAD, DPAD, RPAD = padding + pad_before = [0, 0, 0, TPAD, LPAD] + pad_after = [0, 0, 0, DPAD, RPAD] -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nchw, ['cpu'], 'direct') -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nhwc, ['cpu'], 'direct') -def schedule_bitserial_conv2d(cfg, outs): + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + TH = H + TPAD + DPAD + TW = W + LPAD + RPAD + OH = (H + TPAD + DPAD - KH) // HSTR + 1 + OW = (W + LPAD + RPAD - KW) // WSTR + 1 + + # ==================== define configuration space ==================== + n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) + ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) + ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) + + co, vc = cfg.define_split('tile_co', co, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') + + cfg.define_reorder("reorder_0", + [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], + policy='interval_all', interval=(6, 11)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) + # ==================== + + VC = cfg["tile_co"].size[-1] + VH = cfg["tile_oh"].size[-1] + VW = cfg["tile_ow"].size[-1] + + dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) + kvshape = (CO//VC, CI, KH, KW, KB, VC) + ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) + oshape = (1, CO, OH, OW) + + if (TPAD != 0 and RPAD != 0): + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ + data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') + + if len(kernel.shape) == 4: + kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ + kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, co, h, w, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if unipolar: + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & + kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) + & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv_out') + idxd = tvm.indexdiv + idxm = tvm.indexmod + + return tvm.compute( + oshape, lambda n, co, h, w: + conv[n, + idxd(co, VC), idxd(h, VH), idxd(w, VW), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], + name='conv_vec', tag='spatial_bitserial_conv_nchw') + +@autotvm.register_topi_compute("bitserial_conv2d_nhwc.x86") +def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype='uint32', out_dtype='int16', unipolar=True): + """ Compute convolution with pack on spatial axes. """ + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) + pack_kernel = len(kernel.shape) == 4 + + if pack_kernel: + kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) + else: + kernel_q = kernel + + KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape) + N, H, W, CI, IB = get_const_tuple(data_q.shape) + + if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): + TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) + else: + TPAD, LPAD, DPAD, RPAD = padding + pad_before = [0, TPAD, LPAD, 0, 0] + pad_after = [0, DPAD, RPAD, 0, 0] + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + PAD_H = H + (TPAD + DPAD) + PAD_W = W + (LPAD + RPAD) + OH = (PAD_H - KH) // HSTR + 1 + OW = (PAD_W - KW) // WSTR + 1 + oshape = (1, OH, OW, CO) + + # ==================== define configuration space ==================== + n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) + ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) + ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) + + co, vc = cfg.define_split('tile_co', co, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') + cfg.define_reorder("reorder_0", + [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], + policy='interval_all', interval=(3, 7)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) + # ==================== + + VC = cfg["tile_co"].size[-1] + VH = cfg["tile_oh"].size[-1] + VW = cfg["tile_ow"].size[-1] + + dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) + kvshape = (CO, KH, KW, CI, VC, KB) + ovshape = (1, OH, OW, CO, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (DPAD != 0 and RPAD != 0): + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ + kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if unipolar: + return tvm.sum( + ((tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& + ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), + axis=[dh, dw, ci, b1, b2]) + + return tvm.sum(tvm.popcount( + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + idxd = tvm.indexdiv + idxm = tvm.indexmod + return tvm.compute( + oshape, lambda n, h, w, co: + conv[n, + idxd(h, VH), idxd(w, VW), idxd(co, VC), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], + name='output_unpack', tag='spatial_bitserial_conv_nhwc') + +@autotvm.register_topi_schedule("bitserial_conv2d_nchw.x86") +def schedule_bitserial_conv2d_nchw(cfg, outs): + return _schedule_bitserial_conv2d(cfg, outs) + +@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.x86") +def schedule_bitserial_conv2d_nhwc(cfg, outs): + return _schedule_bitserial_conv2d(cfg, outs) + +def _schedule_bitserial_conv2d(cfg, outs): """CPU schedule for bitserial convolutions NCHW and NHWC""" s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] diff --git a/topi/python/topi/x86/bitserial_dense.py b/topi/python/topi/x86/bitserial_dense.py index 47b972fa1319..d464cae951b3 100644 --- a/topi/python/topi/x86/bitserial_dense.py +++ b/topi/python/topi/x86/bitserial_dense.py @@ -19,11 +19,85 @@ from __future__ import absolute_import as _abs import tvm from tvm import autotvm -from topi.util import get_const_int +from topi.util import get_const_int, get_const_tuple from .. import tag -from .. import generic +from ..nn.bitserial_util import bitpack, binary_op_multiplier -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['cpu'], 'direct') +@autotvm.register_topi_compute('bitserial_dense.x86') +def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint32', + out_dtype='int16', unipolar=True): + """Bitserial dense implementation. TODO: Why are these separate + + Parameters + ---------- + data : tvm.Tensor + 2-D with shape [batch, in_dim] + weight : tvm.Tensor + 2-D with shape [out_dim, in_dim] or + 3-D with shape [out_dim, weight_bits, in_dim] + Returns + ------- + output : tvm.Tensor + 2-D with shape [batch, out_dim] + """ + data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + if len(weight.shape) == 2: + weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) + else: + weight_packed = weight + Y, DB, K = get_const_tuple(data_packed.shape) + X, WB, _ = get_const_tuple(weight_packed.shape) + ######## Search space + x, y = cfg.axis(X), cfg.axis(Y) + db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K) + ko, ki = cfg.define_split('tile_k', k, num_outputs=2) + yo, yi = cfg.define_split('tile_y', y, num_outputs=2) + xo, xi = cfg.define_split('tile_x', x, num_outputs=2) + + cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi], + policy='candidate', candidate=[ + [yo, xo, ko, yi, wb, db, ki, xi], + [yo, xo, yi, ko, wb, db, ki, xi]]) + + cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll') + cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec') + + ###### Compute rule + VX = cfg['tile_x'].size[-1] + + wvshape = (X//VX, WB, VX, K) + oshape = (Y, X) + + k = tvm.reduce_axis((0, K), name='k') + db = tvm.reduce_axis((0, DB), name='db') + wb = tvm.reduce_axis((0, WB), name='wb') + + # Tile data and weights + weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k: + weight_packed[xo*VX+vx][wb][k], name='weight_vec') + + idxdiv = tvm.indexdiv + idxmod = tvm.indexmod + + matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( + (tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - + tvm.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) + ).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') + + matmul = tvm.compute(oshape, lambda i, j: tvm.sum( + tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] + ).astype(out_dtype) + << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') + + # binary ops + cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype)) + + if unipolar: + return matmul_unipolar + return matmul + +@autotvm.register_topi_schedule('biserial_dense.x86') def schedule_bitserial_dense(cfg, outs): """Schedule for bitserial_dense. diff --git a/topi/python/topi/x86/conv1d.py b/topi/python/topi/x86/conv1d.py index 95fd159acd47..70c2a6881dbf 100644 --- a/topi/python/topi/x86/conv1d.py +++ b/topi/python/topi/x86/conv1d.py @@ -18,10 +18,9 @@ """Conv1D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm -from .. import generic, tag +from .. import tag -@generic.schedule_conv1d_ncw.register(["cpu"]) def schedule_conv1d_ncw(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) @@ -76,7 +75,6 @@ def traverse(op): return s -@generic.schedule_conv1d_nwc.register(["cpu"]) def schedule_conv1d_nwc(outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 95ce3376ac3a..b4b69d85fcfa 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -14,25 +14,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member,import-outside-toplevel +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D schedule on x86""" import logging -import re import tvm from tvm import autotvm -from tvm.autotvm.task.topi_integration import deserialize_args -from tvm.autotvm.task import get_config -from .. import generic, tag +from .. import tag from .. import nn -from ..nn.conv2d import conv2d, conv2d_NCHWc, \ - conv2d_infer_layout, _get_workload as _get_conv2d_workload +from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload +from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload -from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple - +from ..util import get_const_tuple, traverse_inline from . import conv2d_avx_1x1, conv2d_avx_common logger = logging.getLogger('topi') @@ -61,199 +56,25 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth else: conv2d_avx_common._fallback_schedule(cfg, wkl) -def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout): - """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - pat = re.compile(r'NCHW.+(\d+)c') - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - elif layout == 'NHWC': - n, h, w, ic = dshape - kh, kw, oc, _ = kshape - elif pat.match(layout) is not None: - n, ic_chunk, h, w, ic_bn = dshape - target = tvm.target.Target.current(allow_none=False) - oc_chunk, k_ic_chunk, kh, kw, k_ic_bn, oc_bn = kshape - assert ic_chunk == k_ic_chunk - assert ic_bn == k_ic_bn - ic = ic_chunk*ic_bn - oc = oc_chunk*oc_bn - else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) - - is_kernel_1x1 = kh == 1 and kw == 1 - pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw)) - sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (h - kh + pt + pb) // sh + 1 - ow = (w - kw + pl + pr) // sw + 1 - - # Create schedule config - cfg.define_split("tile_ic", ic, num_outputs=2) - cfg.define_split("tile_oc", oc, num_outputs=2) - cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) - if is_kernel_1x1: - cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) - else: - cfg.define_knob("unroll_kw", [True, False]) - - -@autotvm.register_topi_compute(conv2d, 'cpu', ['direct']) -def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - out_dtype = data.dtype if out_dtype is None else out_dtype - strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) - dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - - if layout == 'NCHW': - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype) - return _declaration_conv_impl(cfg, data, kernel, strides, - padding, dilation, layout, out_dtype) - - # HWOI kernel layout is for NHWC and HWCN - kh, kw, _, _ = get_const_tuple(kernel.shape) - if layout == 'HWCN': - return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - # FIXME - https://github.com/apache/incubator-tvm/issues/4122 - # _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO - # layout. Commenting until we have clarity about the nhwc_pack implementation from the author. - # elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8": - # if cfg.is_fallback: - # _get_default_config(cfg, data, kernel, strides, padding, out_dtype, False, layout) - # # specialize for INT8 1X1 conv on X86 - # return conv2d_avx_1x1._declaration_conv_nhwc_pack(cfg, data, kernel, strides, - # padding, dilation, out_dtype) - if layout == 'NHWC': - return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - -def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - out_dtype = data.dtype if out_dtype is None else out_dtype - assert layout == 'NCHW', "only support NCHW convolution for AVX" - - assert isinstance(dilation, int) or len(dilation) == 2 - if isinstance(dilation, int): - dilation_h, dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) - - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_height, kernel_width)) - pad_h = pad_top + pad_down - pad_w = pad_left + pad_right - - pad_height = in_height + pad_h - pad_width = in_width + pad_w - - dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 - out_height = (in_height + pad_h - dilated_kernel_h) // HSTR + 1 - out_width = (in_width + pad_w - dilated_kernel_w) // WSTR + 1 - - # pack data - DOPAD = (pad_h != 0 or pad_w != 0) - if DOPAD: - data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), \ - name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - - shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) - data_vec = tvm.compute(shape, - lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], - name='data_vec') - - # pack kernel - shape = (num_filter//oc_bn, in_channel//ic_bn, - kernel_height, kernel_width, ic_bn, oc_bn) - kernel_vec = tvm.compute(shape, - lambda CO, CI, h, w, ci, co: - kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], - name='kernel_vec') - - # convolution - oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn) - unpack_shape = (batch_size, num_filter, out_height, out_width) - - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - idxmod = tvm.indexmod +@conv2d_infer_layout.register("cpu") +def _conv2d_infer_layout(workload, cfg): + _, data, kernel, strides, padding, dilation, layout, _, dtype = workload + batch_size, in_channel, in_height, in_width = data[:-1] + out_channel, _, k_height, k_width = kernel[:-1] idxdiv = tvm.indexdiv - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, idxdiv(ic, ic_bn), oh*HSTR+kh*dilation_h, - idxmod(ic, ic_bn), - ow*WSTR+kw*dilation_w].astype(out_dtype) * - kernel_vec[oc_chunk, idxdiv(ic, ic_bn), kh, kw, - idxmod(ic, ic_bn), - oc_block].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - unpack = tvm.compute(unpack_shape, - lambda n, c, h, w: conv[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)] - .astype(out_dtype), - name='output_unpack', - tag='conv2d_nchw') - return unpack - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct']) -def schedule_conv2d(cfg, outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - - if 'conv2d_nchw' in op.tag: - output = op.output(0) - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[1] - kernel = kernel_vec.op.input_tensors[0] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] - data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - - _, _, kh, kw = get_const_tuple(kernel.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - if is_kernel_1x1: - conv2d_avx_1x1._schedule_conv(*args) - else: - conv2d_avx_common._schedule_conv(*args) - - scheduled_ops.append(op) - - traverse(outs[0].op) - return s + pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) + out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) + in_layout = "NCHW%dc" % tile_ic + out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc) + out_layout = "NCHW%dc" % tile_oc + return ((in_shape, in_layout),), ((out_shape, out_layout),) -@generic.schedule_conv2d_nhwc.register("cpu") def schedule_conv2d_nhwc(outs): - """Create schedule for tensors""" + """Create schedule for conv2d_nhwc""" s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op scheduled_ops = [] @@ -305,132 +126,116 @@ def traverse(op): traverse(output_op) return s +def conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): + layout = "NCHW" + packed_out = conv2d_NCHWc(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. -@autotvm.task.register("topi_x86_conv2d_NCHWc") -def _topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) +def schedule_conv2d_nchw(outs): + """Create schedule for tensors""" + return schedule_conv2d_NCHWc(outs) - if len(args) == 7: - data, kernel, strides, padding, dilation, origin_layout, dtype = args - else: - assert len(args) == 8 - data, kernel, strides, padding, dilation, origin_layout, out_layout, dtype = args +def _pack_data(cfg, data, kernel): + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn - # get config here - cfg = get_config() - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, origin_layout) + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - # change shape with the value in config - ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1]) - new_data_shape = (raw_data_shape[0], idxdiv(raw_data_shape[1], ic_bn), - raw_data_shape[2], raw_data_shape[3], ic_bn) - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - new_kernel_shape = (idxdiv(raw_kernel_shape[0], oc_bn), - idxdiv(raw_kernel_shape[1], ic_bn), - raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = _schedule_conv2d_NCHWc(cfg, [C]) - return s, [new_data, new_kernel, C] + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[occ * oc_bn + ocb, + icc * ic_bn + icb, k_h, k_w], + name="kernel_vec") + return data, kernel -@conv2d_infer_layout.register("cpu") -def _conv2d_infer_layout(workload, cfg): - _, data, kernel, strides, padding, dilation, layout, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] - idxdiv = tvm.indexdiv - - pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 - tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) - in_layout = "NCHW%dc" % tile_ic - out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc) - out_layout = "NCHW%dc" % tile_oc - return ((in_shape, in_layout),), ((out_shape, out_layout),) - - -@autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct') -def _declaration_conv_NCHWc(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): +@autotvm.register_topi_compute("conv2d_NCHWc.x86") +def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) - in_channel = ic_chunk * ic_bn - oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = \ + if len(data.shape) == 5: + n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = \ get_const_tuple(kernel.shape) - num_filter = oc_chunk * oc_bn + in_channel = ic_chunk * ic_bn + num_filter = oc_chunk * oc_bn + else: + n, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) - # If no config was set, we can fallback to NCHW config. + # Define autotvm tuning space + is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 + pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) + sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) + oh = (ih - kernel_height + pt + pb) // sh + 1 + ow = (iw - kernel_width + pl + pr) // sw + 1 + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", num_filter, num_outputs=2) + cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + if is_kernel_1x1: + cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) + else: + cfg.define_knob("unroll_kw", [True, False]) + + # If no config was set, we can fallback to default config. if cfg.is_fallback: _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) - return nn.conv2d_NCHWc_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'cpu', ['direct']) -def _schedule_conv2d_NCHWc(cfg, outs): + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. + if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + + return nn.conv2d_NCHWc(data, + kernel, + strides, + padding, + dilation, + layout, + out_layout, + out_dtype) + +@autotvm.register_topi_schedule("conv2d_NCHWc.x86") +def schedule_conv2d_NCHWc(cfg, outs): """Create schedule for tensors""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) + def _callback(op): if 'conv2d_NCHWc' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ - else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] - target = tvm.target.Target.current(allow_none=False) - _, _, kh, kw, _, _, = get_const_tuple(kernel.shape) + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] + _, _, kh, kw, _, _, = get_const_tuple(kernel_vec.shape) if kh == 1 and kw == 1: conv2d_avx_1x1._schedule_conv_NCHWc(*args) else: conv2d_avx_common._schedule_conv_NCHWc(*args) - scheduled_ops.append(op) - - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s + + +# FIXME - https://github.com/apache/incubator-tvm/issues/4122 +# _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO +# layout. Commenting until we have clarity about the nhwc_pack implementation from the author. +# elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8": +# if cfg.is_fallback: +# _get_default_config(cfg, data, kernel, strides, padding, out_dtype, False, layout) +# # specialize for INT8 1X1 conv on X86 +# return conv2d_avx_1x1._declaration_conv_nhwc_pack(cfg, data, kernel, strides, +# padding, dilation, out_dtype) diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 8b0c13c2c0bb..10f11ffe3456 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -23,117 +23,102 @@ from tvm import relay from tvm import autotvm from .conv2d import _get_default_config -from .conv2d_int8 import _is_int8_hw_support, _get_default_config_int8 -from ..util import get_const_tuple, get_shape -from ..nn import conv2d_legalize -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_NCHWc_int8, conv2d_alter_layout -from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw +from .conv2d_int8 import is_int8_hw_support, _get_default_config_int8 +from ..util import get_const_tuple +from ..nn import conv2d_legalize, conv2d_alter_layout from ..nn.util import get_pad_tuple logger = logging.getLogger('topi') @conv2d_alter_layout.register("cpu") -def _alter_conv2d_layout(attrs, inputs, tinfo, F): +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.current_target(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): + cfg = dispatch_ctx.query(target, None) + workload = cfg.workload + else: + _, outs = relay.backend.compile_engine.select_implement( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + + topi_tmpl = workload[0] + new_attrs = {k : attrs[k] for k in attrs.keys()} + # Parse the attributes. - groups = attrs.get_int("groups") padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") - out_dtype = attrs["out_dtype"] - layout_name = 'data_layout' - data_layout = attrs[layout_name] - kh, kw = attrs.get_int_tuple("kernel_size") - - data_tensor, kernel_tensor = tinfo[0], tinfo[1] - if attrs[layout_name] == 'NHWC' and attrs['kernel_layout'] == 'HWIO': - batch_size, height, width, in_channel = get_const_tuple(data_tensor.shape) - kh, kw, _, out_channel = get_const_tuple(kernel_tensor.shape) - elif attrs[layout_name] == 'NCHW' and attrs['kernel_layout'] == 'OIHW': - batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) - out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) - else: - return None - + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor = tinfos data_dtype = data_tensor.dtype kernel_dtype = kernel_tensor.dtype - out_dtype = data_dtype if out_dtype in ("same", "") else out_dtype - - # Check if depthwise. - kshape = get_shape(kernel_tensor.shape, attrs["kernel_layout"], "OIHW") - is_depthwise = groups == kshape[0] and kshape[1] == 1 - - # Save the input exprs. - copy_inputs = list(inputs) - - # Set the new attrs - new_attrs = {k : attrs[k] for k in attrs.keys()} - new_attrs['channels'] = out_channel - - # Return if the groups is not 1 and depthwise. - if groups != 1 and not is_depthwise: - return None - - # Set workload. Config update. - dispatch_ctx = autotvm.task.DispatchContext.current - target = tvm.target.Target.current() - - if is_depthwise: - workload = autotvm.task.args_to_workload( - [data_tensor, kernel_tensor, strides, padding, dilation, out_dtype], - depthwise_conv2d_nchw) - else: - workload = autotvm.task.args_to_workload( - [data_tensor, kernel_tensor, strides, padding, dilation, data_layout, out_dtype], - conv2d) - - cfg = dispatch_ctx.query(target, workload) - if cfg.is_fallback: - if _is_int8_hw_support(data_dtype, kernel_dtype): - _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, - is_depthwise, data_layout) - else: - _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, - is_depthwise, data_layout) + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc.x86": + # we only convert conv2d_NCHW to conv2d_NCHWc for x86 + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - # Get the tiling parameters to set the layout names. - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - new_attrs[layout_name] = 'NCHW%dc' % ic_bn - new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - if is_depthwise and data_layout == 'NCHW' and attrs['kernel_layout'] == 'OIHW': - new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc) + [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], + new_attrs["out_layout"], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_NCHWc_int8.x86": + # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) - return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs) - - if _is_int8_hw_support(data_dtype, kernel_dtype): - # Convert kernel data layout from 4D to 7D + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] n_elems = 4 - data_expr, kernel_expr = inputs - if attrs['kernel_layout'] == 'HWIO': - kernel_IHWO = F.transpose(kernel_expr, axes=(2, 0, 1, 3)) - elif attrs['kernel_layout'] == 'OIHW': - kernel_IHWO = F.transpose(kernel_expr, axes=(1, 2, 3, 0)) - else: - return None - - kernel_IHWOo = F.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn)) - kernel_OHWoI = F.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0)) - kernel_OHWoIi = F.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn, - in_channel//ic_bn, ic_bn)) - kernel_OHWoIie = F.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn, - in_channel//ic_bn, ic_bn//n_elems, n_elems)) - kernel_OIHWioe = F.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6)) - copy_inputs = [data_expr, kernel_OIHWioe] - # Store altered operator's config. New kernel layout OIHWio4 + # convert kernel data layout from 4D to 7D + data_expr, kernel_expr = inputs + kernel_IHWO = relay.transpose(kernel_expr, axes=(1, 2, 3, 0)) + kernel_IHWOo = relay.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn)) + kernel_OHWoI = relay.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0)) + kernel_OHWoIi = relay.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn)) + kernel_OHWoIie = relay.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn//n_elems, n_elems)) + kernel_OIHWioe = relay.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6)) + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config. + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) new_kernel = tvm.placeholder((out_channel // oc_bn, in_channel // ic_bn, kh, @@ -141,30 +126,40 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): ic_bn // n_elems, oc_bn, n_elems), dtype=kernel_dtype) - - new_workload = autotvm.task.args_to_workload([new_data, - new_kernel, - strides, - padding, - dilation, - new_attrs[layout_name], - new_attrs['out_layout'], - out_dtype], - conv2d_NCHWc_int8) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_nchwc_int8(*copy_inputs, **new_attrs) - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) - # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, - kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], conv2d_NCHWc) - dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs) + elif topi_tmpl == "depthwise_conv2d_NCHWc.x86": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, True, data_layout) - return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + assert channel_multiplier == 1 + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config. + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) + else: + return None @conv2d_legalize.register("cpu") @@ -254,7 +249,7 @@ def _conv2d_legalize(attrs, inputs, arg_types): # input channel to be a multiple of 4 and output channels to be a multiple of 16. For input # channels, we pad both the inputs and weights input channels. For output channels, we pad the # weight and stride_slice the output. - if _is_int8_hw_support(data_dtype, kernel_dtype): + if is_int8_hw_support(data_dtype, kernel_dtype): # Flags to remember if the expr is modified ic_modified = False oc_modified = False @@ -311,4 +306,5 @@ def _conv2d_legalize(attrs, inputs, arg_types): out = relay.subtract(out, adjust_shift) return out - return None + else: + return None diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 9726f3d8d4f9..d04f99b774d4 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -18,10 +18,11 @@ """1x1 Conv2D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad -from ..nn.util import infer_pad, get_pad_tuple +from ..nn.util import get_pad_tuple, infer_pad from ..generic import conv2d as conv2d_generic from ..util import get_const_tuple, simplify from .tensor_intrin import dot_16x1x16_uint8_int8_int32 @@ -58,84 +59,41 @@ def _fallback_schedule(cfg, wkl): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): - # fetch schedule - ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_oh"].val, cfg["tile_ow"].size[-1]) - - # no stride and padding info here - padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) - - A, W = data, kernel_vec - A0, A1 = data_pad, data_vec - # schedule data - if DOPAD: - s[A0].compute_inline() - batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis - parallel_axis = s[A1].fuse(batch, ic_chunk, ih) - s[A1].parallel(parallel_axis) - - # schedule kernel pack - oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis - s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if oc_bn > 1: - s[W].vectorize(oc_block) - parallel_axis = s[W].fuse(oc_chunk, oh) - s[W].parallel(parallel_axis) - - C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') - - batch, oc_chunk, oh, ow, oc_block = s[C].op.axis - oh_outer, oh_inner = s[C].split(oh, factor=oh_factor) - s[C].vectorize(oc_block) - - s[CC].compute_at(s[C], oh_outer) - _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, _, _ = s[CC].op.reduce_axis - - ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - - oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor) - - s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block) - s[CC].vectorize(oc_block) - - s[CC].unroll(ow_inner) - s[CC].unroll(oh_inner) - - if O0 != O: - s[O0].compute_inline() - batch, oc, oh, ow = s[O].op.axis - - oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) - oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) - s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) - - parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - - s[O].parallel(parallel_axis) - - return s - - -def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # fetch schedule oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1] - _, _, _, _, ic_bn = get_const_tuple(data.shape) - - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) + + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. + batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) C, O = conv_out, last CC = s.cache_write(C, 'global') @@ -167,22 +125,36 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): s[CC].unroll(oh_inner) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) - s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) - - parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + + parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) return s -def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last): - return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, - int32_lanes=16, +def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): + return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, + conv_out, last, int32_lanes=16, intrin=dot_16x1x16_uint8_int8_int32()) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 7c5096dc2c1a..085d0aeb67c3 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -18,9 +18,9 @@ """Conv2D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..nn.util import infer_pad from ..generic import conv2d as conv2d_generic from ..util import get_const_tuple from .tensor_intrin import dot_16x1x16_uint8_int8_int32 @@ -83,88 +83,42 @@ def _fallback_schedule_int8(cfg, wkl): cfg["unroll_kw"] = OtherOptionEntity(False) -def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): - # fetch schedule - ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) - - # no stride and padding info here - padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) - - A, W = data, kernel_vec - A0, A1 = data_pad, data_vec - - # schedule data - if DOPAD: - s[A0].compute_inline() - batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis - parallel_axis = s[A1].fuse(batch, ic_chunk, ih) - s[A1].parallel(parallel_axis) - - # schedule kernel pack - oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis - s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if oc_bn > 1: - s[W].vectorize(oc_block) - parallel_axis = s[W].fuse(oc_chunk, oh) - s[W].parallel(parallel_axis) - - # schedule conv - C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') - - _, oc_chunk, oh, ow, oc_block = s[C].op.axis - ow_chunk, ow_block = s[C].split(ow, factor=reg_n) - s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - s[C].fuse(oc_chunk, oh) - s[C].vectorize(oc_block) - - s[CC].compute_at(s[C], ow_chunk) - _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, kh, kw = s[CC].op.reduce_axis - - ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) - ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - - if unroll_kw: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) - s[CC].unroll(kw) - else: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) - - s[CC].fuse(oc_chunk, oh) - s[CC].vectorize(oc_block) - s[CC].unroll(ow_block) - - if O0 != O: - s[O0].compute_inline() - - batch, oc, oh, ow = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=reg_n) - oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(batch, oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - - s[O].parallel(parallel_axis) - - return s - - -def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # fetch schedule reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) + + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. + batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) # schedule 5-D NCHW[x]c conv C, O = conv_out, last @@ -195,18 +149,31 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): s[CC].unroll(ow_block) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=reg_n) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(batch, oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) return s -def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last): - return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, - int32_lanes=16, +def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): + return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, + conv_out, last, int32_lanes=16, intrin=dot_16x1x16_uint8_int8_int32()) diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index 20712d2f6f4f..06c80e6e39ca 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -20,15 +20,13 @@ import re import tvm from tvm import autotvm -from tvm.autotvm.task import get_config -from tvm.autotvm.task.topi_integration import deserialize_args from ..nn.conv2d import _get_workload as _get_conv2d_workload -from .. import generic, tag +from .. import tag from ..generic import conv2d as conv2d_generic from ..nn.util import get_pad_tuple -from ..util import get_const_tuple -from ..nn.conv2d import conv2d_NCHWc_int8 +from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload +from ..util import get_const_tuple, traverse_inline from .. import nn from . import conv2d_avx_1x1, conv2d_avx_common @@ -53,7 +51,7 @@ def _get_default_config_int8(cfg, data, kernel, strides, padding, out_dtype, is_ cfg, wkl, int32_lanes=16, num_int8_elements=4) -def _is_int8_hw_support(data_dtype, kernel_dtype): +def is_int8_hw_support(data_dtype, kernel_dtype): """ Checks to ensure that we can use Intel DLBoost instructions 1) The datatypes are correct. @@ -64,7 +62,7 @@ def _is_int8_hw_support(data_dtype, kernel_dtype): is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8' # 2) Check LLVM support - llvm_version = tvm.target.codegen.llvm_version_major() + llvm_version = tvm.codegen.llvm_version_major() is_llvm_support = llvm_version >= 8 # 3) Check target @@ -76,150 +74,120 @@ def _is_int8_hw_support(data_dtype, kernel_dtype): return is_dtype_support and is_llvm_support and is_target_support -def _create_tuning_space_int8(cfg, data, kernel, strides, padding, dilation, layout): - """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - pat = re.compile(r'NCHW.+(\d+)c') - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - elif layout == 'NHWC': - n, h, w, ic = dshape - kh, kw, oc, _ = kshape - elif pat.match(layout) is not None: - n, ic_chunk, h, w, ic_bn = dshape - target = tvm.target.Target.current(allow_none=False) - oc_chunk, k_ic, kh, kw, k_ic_f, oc_bn, k_ic_s = kshape - ic = ic_chunk * ic_bn - assert ic == k_ic * k_ic_f * k_ic_s - oc = oc_chunk*oc_bn +def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype): + layout = "NCHW" + packed_out = conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) + + +def schedule_conv2d_nchw_int8(outs): + return schedule_conv2d_NCHWc_int8(outs) + + +def _pack_data(cfg, data, kernel): + n_elems = 4 + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn + + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") + + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn//n_elems, oc_bn, n_elems), + lambda occ, icc, k_h, k_w, icbc, ocb, icbb: + kernel[occ * oc_bn + ocb, + icc * ic_bn + icbc * ic_bn//n_elems + icbb, k_h, k_w], + name="kernel_vec") + + return data, kernel + + +@autotvm.register_topi_compute("conv2d_NCHWc_int8.x86") +def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, + dilation, layout, out_layout, out_dtype): + if len(data.shape) == 5: + n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + in_channel = ic_chunk * ic_bn + oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ \ + = get_const_tuple(kernel.shape) + num_filter = oc_chunk * oc_bn else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) + n, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = \ + get_const_tuple(kernel.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - pt, pl, pb, pr = get_pad_tuple(padding, kernel) + # Define autotvm tuning space + is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 + pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (h - kh + pt + pb) // sh + 1 - ow = (w - kw + pl + pr) // sw + 1 + oh = (ih - kernel_height + pt + pb) // sh + 1 + ow = (iw - kernel_width + pl + pr) // sw + 1 - # Create schedule config - cfg.define_split('tile_ic', ic, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0) - cfg.define_split('tile_oc', oc, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) + cfg.define_split('tile_ic', in_channel, num_outputs=2, + filter=lambda y: y.size[-1] % 4 == 0) + cfg.define_split('tile_oc', num_filter, num_outputs=2, + filter=lambda y: y.size[-1] % 16 == 0) cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) if is_kernel_1x1: cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) else: cfg.define_knob("unroll_kw", [True, False]) - -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. -@autotvm.task.register("topi_x86_conv2d_NCHWc_int8") -def _topi_nn_conv2d_NCHWc_int8(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - - if len(args) == 7: - data, kernel, strides, padding, dilation, origin_layout, dtype = args - else: - assert len(args) == 8 - data, kernel, strides, padding, dilation, origin_layout, out_layout, dtype = args - - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) - - # get config here - cfg = get_config() - _create_tuning_space_int8(cfg, data, kernel, strides, padding, dilation, origin_layout) - - # change shape with the value in config - ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1]) - - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - - # Set up the new shape for data and kernel - new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, - raw_data_shape[2], raw_data_shape[3], ic_bn) - n_elems = 4 - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, - raw_kernel_shape[1] // ic_bn, - raw_kernel_shape[2], - raw_kernel_shape[3], - ic_bn // n_elems, - oc_bn, - n_elems) - - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - C = _declaration_conv_NCHWc_int8(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = _schedule_conv2d_NCHWc_int8(cfg, [C]) - return s, [new_data, new_kernel, C] - - -@autotvm.register_topi_compute(conv2d_NCHWc_int8, 'cpu', 'direct') -def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): - return nn.conv2d_NCHWc_int8_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc_int8, 'cpu', ['direct']) -def _schedule_conv2d_NCHWc_int8(cfg, outs): + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config_int8( + cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), + strides, padding, out_dtype) + + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. + if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + + return nn.conv2d_NCHWc_int8(data, + kernel, + strides, + padding, + dilation, + layout, + out_layout, + out_dtype) + + +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.x86") +def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + def _callback(op): """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - if 'conv2d_NCHWc_int8' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ - else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] - target = tvm.target.Target.current(allow_none=False) + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] # int8 conv kernel is 7-dim - _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape) + _, _, kh, kw, _, _, _ = get_const_tuple(kernel_vec.shape) if kh == 1 and kw == 1: conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args) else: conv2d_avx_common._schedule_conv_NCHWc_int8(*args) - scheduled_ops.append(op) - - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv2d_nhwc_pack, 'cpu', ['direct']) -def schedule_conv2d_nhwc_pack(cfg, outs): + +@autotvm.register_topi_schedule("conv2d_nhwc_pack_int8.x86") +def schedule_conv2d_nhwc_pack_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op diff --git a/topi/python/topi/x86/conv2d_transpose.py b/topi/python/topi/x86/conv2d_transpose.py index 27fc0afce999..71f47d6c037b 100644 --- a/topi/python/topi/x86/conv2d_transpose.py +++ b/topi/python/topi/x86/conv2d_transpose.py @@ -17,59 +17,34 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D Transpose schedule on x86""" import tvm -from tvm import autotvm -from .. import generic -from ..util import get_const_tuple, traverse_inline -from ..nn import conv2d_transpose_nchw_preprocess, conv2d_transpose_nchw -from . import conv2d_avx_1x1, conv2d_avx_common -from .conv2d import _declaration_conv_impl, \ - _create_tuning_space as _create_tuning_space_conv2d, \ - _get_default_config as _get_default_config_conv2d +from ..util import traverse_inline +from .. import nn +from .conv2d import conv2d_nchw, schedule_conv2d_nchw - -@autotvm.register_topi_compute(conv2d_transpose_nchw, 'cpu', ['direct']) -def _conv2d_transpose_nchw(cfg, data, kernel, strides, padding, out_dtype): +def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype): data_pad, kernel_transform = \ - conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype) - # reuse conv2d implementation - _create_tuning_space_conv2d(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), dilation=(1, 1), layout="NCHW") - if cfg.is_fallback: - _get_default_config_conv2d(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), out_dtype=out_dtype, layout='NCHW') - return _declaration_conv_impl(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), dilation=(1, 1), layout="NCHW", \ - out_dtype=out_dtype) - + nn.conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype) + # reuse conv2d_nchw implementation + return conv2d_nchw(data_pad, kernel_transform, strides=(1, 1), + padding=(0, 0), dilation=(1, 1), out_dtype=out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_transpose_nchw, 'cpu', ['direct']) -def _schedule_conv2d_transpose_nchw(cfg, outs): +def schedule_conv2d_transpose_nchw(outs): """Create schedule for tensors""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - + s = schedule_conv2d_nchw(outs) def _callback(op): - # reuse conv2d schedule - if 'conv2d_nchw' in op.tag: - output = op.output(0) + if 'unpack_nchwc' in op.tag: conv_out = op.input_tensors[0] # retrieve data data_vec = conv_out.op.input_tensors[0] data_pad = data_vec.op.input_tensors[0] data_dilate = data_pad.op.input_tensors[0] s[data_dilate].compute_inline() + s[data_pad].compute_inline() # retrieve kernel kernel_vec = conv_out.op.input_tensors[1] kernel_transform = kernel_vec.op.input_tensors[0] s[kernel_transform].compute_inline() - # call conv2d schedule - _, _, kh, kw = get_const_tuple(kernel_transform.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - args = [s, cfg, data_dilate, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - if is_kernel_1x1: - conv2d_avx_1x1._schedule_conv(*args) - else: - conv2d_avx_common._schedule_conv(*args) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/x86/conv3d.py b/topi/python/topi/x86/conv3d.py index 4a6664eba0e4..4f5b631b5a2a 100644 --- a/topi/python/topi/x86/conv3d.py +++ b/topi/python/topi/x86/conv3d.py @@ -21,9 +21,7 @@ import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import generic from ..util import traverse_inline -from ..nn.conv3d import conv3d, conv3d_ncdhw from ..nn.util import get_pad_tuple3d, infer_pad3d from ..nn.pad import pad from ..util import get_const_tuple, simplify, get_const_int @@ -35,9 +33,8 @@ 'hkernel', 'wkernel', 'dpad', 'hpad', 'wpad', 'dstride', 'hstride', 'wstride']) -@autotvm.register_topi_compute(conv3d, 'cpu', ['direct']) -def _declaration_conv3d(cfg, data, kernel, strides, padding, dilation, - layout, out_dtype): +@autotvm.register_topi_compute("conv3d_ndhwc.x86") +def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): """3D convolution forward operator. Parameters @@ -59,9 +56,6 @@ def _declaration_conv3d(cfg, data, kernel, strides, padding, dilation, dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - layout : str - layout of data - Returns ------- output : tvm.Tensor @@ -72,17 +66,13 @@ def _declaration_conv3d(cfg, data, kernel, strides, padding, dilation, strides = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides) dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation, dilation) - if layout == 'NDHWC': - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype, layout) - return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - elif layout == 'NCDHW': - return conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("Layout {} is not supported".format(layout)) + _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) + if cfg.is_fallback: + _get_default_config(cfg, data, kernel, strides, padding, out_dtype, layout) + return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv3d_ndhwc, 'cpu', ['direct']) +@autotvm.register_topi_schedule("conv3d_ndhwc.x86") def schedule_conv3d_ndhwc(cfg, outs): """TOPI schedule callback for conv3d Parameters diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index c6c3d5e667ac..734ba2f71330 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -23,147 +23,9 @@ from tvm.contrib import cblas from .util import get_fp32_len -from .. import generic, tag, nn +from .. import generic, tag from ..util import traverse_inline, get_const_tuple -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None): - target = tvm.target.Target.current() - if "cblas" in target.libs: - C = cblas.matmul(data, weight, False, True) - if bias is not None: - C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - M, _ = get_const_tuple(data.shape) - # Always use dense_nopack for dynamic input. - # This is a temporary for CV models. - # TODO(kevinthesun): use kernel dispatcher instead. - if isinstance(M, tvm.expr.Var): - return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if M <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype) - return _declaration_dense_pack(cfg, data, weight, bias, out_dtype) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None): - if out_dtype is None: - out_dtype = data.dtype - M, K = get_const_tuple(data.shape) # batch, in_dim - N, _ = get_const_tuple(weight.shape) # out_dim - # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=3) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=3) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, M, N, K) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (N // packw_bn, K, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - k = tvm.reduce_axis((0, K), name="k") - C = tvm.compute((M, N), - lambda y, x: tvm.sum( - data[y, k].astype(out_dtype) * - packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None): - if out_dtype is None: - out_dtype = data.dtype - M, K = get_const_tuple(data.shape) - N, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=2) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=2) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, M, N, K) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, K // vec), "k") - CC = tvm.compute((M, N, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x].astype(out_dtype) * - weight[y, k * vec + x].astype(out_dtype), axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((M, N), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") -def _schedule_dense_pack(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") -def _schedule_dense_nopack(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - def _schedule_dense_pack_template(cfg, s, C): A, packedB = s[C].op.input_tensors @@ -270,3 +132,100 @@ def _default_dense_nopack_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) cfg["tile_x"] = SplitEntity([N, 1]) cfg["tile_y"] = SplitEntity([1, M]) + +@autotvm.register_topi_compute("dense_nopack.x86") +def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = data.dtype + M, K = get_const_tuple(data.shape) + N, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", M, num_outputs=2) + cfg.define_split("tile_x", N, num_outputs=2) + cfg.define_split("tile_k", K, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, M, N, K) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, K // vec), "k") + CC = tvm.compute((M, N, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x].astype(out_dtype) * + weight[y, k * vec + x].astype(out_dtype), axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((M, N), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + + +@autotvm.register_topi_schedule("dense_nopack.x86") +def schedule_dense_nopack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + +@autotvm.register_topi_compute("dense_pack.x86") +def dense_pack(cfg, data, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = data.dtype + M, K = get_const_tuple(data.shape) # batch, in_dim + N, _ = get_const_tuple(weight.shape) # out_dim + # create tuning space + cfg.define_split("tile_y", M, num_outputs=3) + cfg.define_split("tile_x", N, num_outputs=3) + cfg.define_split("tile_k", K, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, M, N, K) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (N // packw_bn, K, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + idxdiv = tvm.indexdiv + idxmod = tvm.indexmod + k = tvm.reduce_axis((0, K), name="k") + C = tvm.compute((M, N), + lambda y, x: tvm.sum( + data[y, k].astype(out_dtype) * + packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), + axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + +@autotvm.register_topi_schedule("dense_pack.x86") +def schedule_dense_pack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + +@autotvm.register_topi_compute("dense_cblas.x86") +def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): + M, K = get_const_tuple(data.shape) + N, _ = get_const_tuple(weight.shape) + cfg.add_flop(M * K * N * 2) + C = cblas.matmul(data, weight, False, True) + if bias is not None: + C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + +@autotvm.register_topi_schedule("dense_cblas.x86") +def schedule_dense_cblas(_, outs): + return generic.schedule_extern(outs) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 385537b95e4d..a3a02a50aecd 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -18,17 +18,13 @@ """Depthwise Conv2D schedule on x86""" import tvm from tvm import autotvm -from tvm.autotvm.task import get_config from tvm.autotvm.task.space import SplitEntity -from tvm.autotvm.task.topi_integration import deserialize_args -from .. import generic, tag -from ..generic import schedule_depthwise_conv2d_nchw +from .. import tag from ..nn.pad import pad from ..util import get_const_tuple from ..nn.util import get_pad_tuple -from ..nn.depthwise_conv2d import depthwise_conv2d_nchw, depthwise_conv2d_NCHWc, \ - _get_workload, depthwise_conv2d_infer_layout - +from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout +from ..nn.conv2d import unpack_NCHWc_to_nchw from .util import get_fp32_len def _fallback_schedule(cfg, wkl): @@ -70,20 +66,53 @@ def _fallback_schedule(cfg, wkl): cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) +def depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): + layout = "NCHW" + packed_out = depthwise_conv2d_NCHWc(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) + +def schedule_depthwise_conv2d_nchw(outs): + return schedule_depthwise_conv2d_NCHWc(outs) + +def _pack_data(cfg, data, kernel): + n, ic, ih, iw = get_const_tuple(data.shape) + filter, cm, kh, kw = get_const_tuple(kernel.shape) + oc = filter * cm + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'cpu', 'direct', - depthwise_conv2d_nchw.fdefault) -autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'cpu', 'direct', - schedule_depthwise_conv2d_nchw.fdefault) + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") -@autotvm.register_topi_compute(depthwise_conv2d_NCHWc, 'cpu', 'direct') -def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, - layout, out_layout, out_dtype=None): + kernel = tvm.compute( + (oc_chunk, 1, kh, kw, 1, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[(occ * oc_bn + ocb) // cm, + (occ * oc_bn + ocb) % cm, k_h, k_w], + name="kernel_vec") + + return data, kernel + +@autotvm.register_topi_compute("depthwise_conv2d_NCHWc.x86") +def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, + layout, out_layout, out_dtype=None): out_dtype = data.dtype if out_dtype is None else out_dtype - batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape) - out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \ - = get_const_tuple(kernel.shape) + + if len(data.shape) == 5: + batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape) + out_channel_chunk, cm_chunk, filter_height, filter_width, cm_block, out_channel_block \ + = get_const_tuple(kernel.shape) + in_channel = in_channel_chunk * in_channel_block + out_channel = out_channel_chunk * out_channel_block + channel_multiplier = cm_chunk * cm_block + else: + batch, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, channel_multiplier, filter_height, filter_width = get_const_tuple(kernel.shape) + assert channel_multiplier == 1 strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) HSTR, WSTR = strides @@ -92,13 +121,13 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) assert (dh, dw) == (1, 1), "Does not support dilation" - in_channel = in_channel_chunk * in_channel_block - out_channel = out_channel_chunk * out_channel_block - channel_multiplier = out_channel // in_channel - out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1 out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1 + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + # get workload and related schedule config wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), tvm.placeholder((out_channel, in_channel, filter_height, filter_width), @@ -107,6 +136,14 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, if cfg.is_fallback: _fallback_schedule(cfg, wkl) + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. + if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + _, _, _, _, in_channel_block = get_const_tuple(data.shape) + out_channel_chunk, _, _, _, _, out_channel_block \ + = get_const_tuple(kernel.shape) + # padding stage DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0) if DOPAD: @@ -136,8 +173,7 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc") return Output - -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_NCHWc, 'cpu', ['direct']) +@autotvm.register_topi_schedule("depthwise_conv2d_NCHWc.x86") def schedule_depthwise_conv2d_NCHWc(cfg, outs): """CPU schedule for depthwise conv2d in NCHW[x]c layout""" s = tvm.create_schedule([x.op for x in outs]) @@ -160,14 +196,22 @@ def traverse(op): traverse(outs[0].op) return s -def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output): +def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out, output): tile_ow = cfg["tile_ow"].size[-1] - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - p = s[A].fuse(ic_chunk, ih) - s[A].parallel(p) + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make recrods accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") C, O = conv_out, output CC = s.cache_write(C, 'global') @@ -196,41 +240,6 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output s[O].parallel(parallel_axis) return s - -@autotvm.task.register("topi_x86_depthwise_conv2d_NCHWc_from_nchw") -def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - data, kernel, strides, padding, dilation, dtype = deserialize_args(args) - - batch, in_channel, height, width = get_const_tuple(data.shape) - filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape) - pt, pl, pb, pr = get_pad_tuple(padding, kernel) - sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - out_height = (height - kh + pt + pb) // sh + 1 - out_width = (width - kw + pl + pr) // sw + 1 - out_channel = filter_channel * channel_multiplier - - # get config here - cfg = get_config() - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) - - # change shape with the value in config - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn) - new_kernel_shape = (out_channel // oc_bn, 1, kh, kw, 1, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - - C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = schedule_depthwise_conv2d_NCHWc(cfg, [C]) - return s, [new_data, new_kernel, C] - @depthwise_conv2d_infer_layout.register("cpu") def _depthwise_conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, dtype = workload diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index d6bb7622d640..375827bb271c 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -18,10 +18,8 @@ """x86 declaration and schedules.""" from __future__ import absolute_import as _abs import tvm -from .. import generic from ..util import is_empty_shape -@generic.schedule_injective_from_existing.register(["cpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -53,7 +51,6 @@ def schedule_injective_from_existing(sch, out): sch[out].vectorize(li) return sch -@generic.schedule_injective.register(["cpu"]) def schedule_injective(outs): """X86 schedule for injective op. @@ -77,7 +74,6 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@generic.schedule_concatenate.register(["cpu"]) def schedule_concatenate(outs): """X86 schedule for concatenate op. diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index 45cb17e5c7b3..0da5316abaf8 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -20,7 +20,6 @@ import tvm from .. import generic -@generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): """Schedule for softmax diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index ed7d525028e4..4f7866df156f 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -59,7 +59,6 @@ def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64): sch.parallel(fused) -@generic.schedule_pool.register(["cpu"]) def schedule_pool(outs, layout): """Schedule for pool @@ -117,7 +116,6 @@ def traverse(OP): return s -@generic.schedule_adaptive_pool.register(["cpu"]) def schedule_adaptive_pool(outs): """Schedule for adaptive pool diff --git a/topi/python/topi/x86/reduction.py b/topi/python/topi/x86/reduction.py index f704d4961f15..b9dd4d4f1b3c 100644 --- a/topi/python/topi/x86/reduction.py +++ b/topi/python/topi/x86/reduction.py @@ -18,8 +18,8 @@ """x86 declaration and schedules.""" from __future__ import absolute_import as _abs import tvm +from .injective import schedule_injective_from_existing from .. import tag -from .. import generic from ..util import get_const_tuple def _schedule_reduce(sch, op, is_idx_reduce=False): @@ -58,7 +58,6 @@ def _schedule_reduce(sch, op, is_idx_reduce=False): sch[out].parallel(fused) -@generic.schedule_reduce.register(["cpu"]) def schedule_reduce(outs): """X86 schedule for reduction op. @@ -95,7 +94,7 @@ def traverse_after_reduce(operator): """Internal traverse function""" if tag.is_broadcast(operator.tag): if operator not in scheduled_ops: - generic.schedule_injective_from_existing(sch, operator) + schedule_injective_from_existing(sch, operator) for tensor in operator.input_tensors: traverse_after_reduce(tensor.op) elif operator.tag == 'comm_reduce': diff --git a/topi/python/topi/x86/roi_align.py b/topi/python/topi/x86/roi_align.py index 26b84be9585b..203c3dd1802b 100644 --- a/topi/python/topi/x86/roi_align.py +++ b/topi/python/topi/x86/roi_align.py @@ -20,7 +20,6 @@ import tvm from tvm import hybrid -from ..vision.rcnn import roi_align_nchw from ..tensor import full from ..util import get_const_tuple @@ -185,8 +184,7 @@ def roi_align_nchw_ir(data, rois, w_pc, pos_pc, pooled_size, spatial_scale, samp return output -@roi_align_nchw.register("cpu") -def roi_align_nchw_cpu(data, rois, pooled_size, spatial_scale, sample_ratio=-1): +def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): """ROI align operator in NCHW layout. Parameters diff --git a/topi/python/topi/x86/sparse.py b/topi/python/topi/x86/sparse.py index c9e0e3864a5a..85a286a351e4 100644 --- a/topi/python/topi/x86/sparse.py +++ b/topi/python/topi/x86/sparse.py @@ -18,13 +18,11 @@ """sparse_dense schedule on x86""" import tvm -from .. import generic from ..util import traverse_inline, get_const_int from .util import get_fp32_len -@generic.schedule_sparse_dense.register(["cpu"]) -def _schedule_sparse_dense(outs): +def schedule_sparse_dense(outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): diff --git a/topi/src/topi.cc b/topi/src/topi.cc index a7b916093d98..79e223c30975 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -677,7 +677,7 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_softmax") TVM_REGISTER_GLOBAL("topi.rocm.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::rocm::schedule_lrn(args[0], args[1]); + *rv = topi::rocm::schedule_lrn(args[0]); }); /* CUDA schedules */ @@ -723,7 +723,7 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::cuda::schedule_lrn(args[0], args[1]); + *rv = topi::cuda::schedule_lrn(args[0]); }); /* Utility functions */ diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index 4e0a45be0a22..372d19628ca0 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -16,9 +16,10 @@ # under the License. """Common utility for topi test""" +import tvm from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity - +import topi def get_all_backend(): """return all supported target @@ -31,6 +32,40 @@ def get_all_backend(): return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu'] +_injective_schedule = { + "generic": topi.generic.schedule_injective, + "cpu": topi.x86.schedule_injective, + "arm_cpu": topi.arm_cpu.schedule_injective, + "gpu": topi.cuda.schedule_injective, + "hls": topi.hls.schedule_injective, + "opengl": topi.opengl.schedule_injective +} + +_reduce_schedule = { + "generic": topi.generic.schedule_reduce, + "cpu": topi.x86.schedule_reduce, + "gpu": topi.cuda.schedule_reduce, + "hls": topi.cuda.schedule_reduce +} + +def get_schedule_injective(target): + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in _injective_schedule: + return _injective_schedule[key] + return _injective_schedule["generic"] + +def get_schedule_reduce(target): + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in _reduce_schedule: + return _reduce_schedule[key] + return _reduce_schedule["generic"] + +get_schedule_broadcast = get_schedule_injective +get_schedule_elemwise = get_schedule_injective class Int8Fallback(autotvm.FallbackContext): def _query_inside(self, target, workload): @@ -38,7 +73,6 @@ def _query_inside(self, target, workload): if key in self.memory: return self.memory[key] cfg = FallbackConfigEntity() - cfg.template_key = 'int8' self.memory[key] = cfg cfg.is_fallback = False return cfg diff --git a/topi/tests/python/test_fifo_buffer.py b/topi/tests/python/test_fifo_buffer.py index 022272f6c4da..8b74e215df63 100644 --- a/topi/tests/python/test_fifo_buffer.py +++ b/topi/tests/python/test_fifo_buffer.py @@ -19,7 +19,7 @@ import tvm import topi import numpy as np -from common import get_all_backend +from common import get_all_backend, get_schedule_injective from tvm.contrib.pickle_memoize import memoize def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype='float32'): @@ -52,7 +52,7 @@ def check_device(device): with tvm.target.create(device): out = topi.nn.fifo_buffer(data, buffer, axis=axis) - s = topi.generic.schedule_injective([out]) + s = get_schedule_injective(device)([out]) buffer_tvm = tvm.nd.array(buffer_np, ctx=ctx) data_tvm = tvm.nd.array(data_np, ctx=ctx) @@ -128,7 +128,7 @@ def check_device(device): with tvm.target.create(device): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = get_schedule_injective(device)([out]) update_context = tvm.build(s, [inc_input, context, out], device, name='update_context') out = topi.nn.conv2d(context, kernel, strides=stride, padding=padding, dilation=dilate, @@ -137,12 +137,12 @@ def check_device(device): conv2d_inc = tvm.build(s, [context, kernel, out], device, name='conv2d_inc') out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = get_schedule_injective(device)([out]) update_output_window = tvm.build(s, [inc_output, output_window, out], device, name='update_output_window') out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = get_schedule_injective(device)([out]) update_input_window = tvm.build(s, [inc_input, input_window, out], device, name='update_input_window') diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index 5a0a940d3d7b..56b82b0cda68 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. """Test code for broadcasting operators.""" -from common import get_all_backend import numpy as np import tvm import topi +from common import get_all_backend, get_schedule_broadcast def verify_broadcast_to_ele(in_shape, out_shape, fbcast): @@ -33,7 +33,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = get_schedule_broadcast(device)(B) foo = tvm.build(s, [A, B], device, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) @@ -81,7 +81,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = get_schedule_broadcast(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) @@ -252,7 +252,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = get_schedule_broadcast(device)(B) foo = tvm.build(s, [A, B], device, name=name) data_npy = indata.astype(A.dtype) @@ -335,7 +335,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = get_schedule_broadcast(device)(C) foo = tvm.build(s, [A, B, C], device, name=name) lhs_nd = tvm.nd.array(lhs, ctx) diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 585374f33a64..c875e835e8f7 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -21,7 +21,7 @@ from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def verify_clip(N, a_min, a_max, dtype): A = tvm.placeholder((N, N), dtype=dtype, name='A') @@ -43,7 +43,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_depth_to_space.py b/topi/tests/python/test_topi_depth_to_space.py index 4e895cb5db55..b79597a9e143 100644 --- a/topi/tests/python/test_topi_depth_to_space.py +++ b/topi/tests/python/test_topi_depth_to_space.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def verify_depth_to_space(block_size, batch, in_channel, in_height, in_width, layout='NCHW', mode='DCR'): @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_image.py b/topi/tests/python/test_topi_image.py index 21935cb911da..81c44d1e97e9 100644 --- a/topi/tests/python/test_topi_image.py +++ b/topi/tests/python/test_topi_image.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def verify_resize(batch, in_channel, in_height, in_width, out_height, out_width, layout='NCHW', coord_trans="align_corners", method="bilinear"): @@ -52,7 +52,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -116,7 +116,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index 5bb95ba10e3b..e9d3bc9a576d 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -20,7 +20,7 @@ import topi import topi.testing from topi import util -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def test_util(): @@ -62,23 +62,15 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name=name) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device('llvm') - check_device('cuda') - check_device('opencl') - check_device('metal') - check_device('rocm') - check_device('vulkan') - check_device('nvptx') - check_device('llvm -device=arm-cpu') - check_device('opencl -device=mali') - check_device('aocl_sw_emu') + for target in get_all_backend(): + check_device(target) def test_isnan( low, @@ -110,23 +102,15 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="isnan") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device('llvm') - check_device('cuda') - check_device('opencl') - check_device('metal') - check_device('rocm') - check_device('vulkan') - check_device('nvptx') - check_device('llvm -device=arm-cpu') - check_device('opencl -device=mali') - check_device('aocl_sw_emu') + for target in get_all_backend(): + check_device(target) test_apply(topi.floor, "floor", np.floor, -100, 100) test_apply(topi.ceil, "ceil", np.ceil, -100, 100) @@ -168,7 +152,7 @@ def verify(from_dtype, to_dtype, low=-100, high=100): continue print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device) a = tvm.nd.array(a_np, ctx) b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index d266cfc6ceb5..3b854ceba2a9 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -20,7 +20,7 @@ import tvm import topi -from common import get_all_backend +from common import get_all_backend, get_schedule_reduce def _my_npy_argmax(arr, axis, keepdims): if not keepdims: @@ -74,7 +74,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_reduce(B) + s = get_schedule_reduce(device)(B) foo = tvm.build(s, [A, B], device, name=type) # Test diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 8868d4ebffe3..ee7aeed037d7 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -21,7 +21,8 @@ import topi from topi.util import get_const_tuple from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend + +from common import get_all_backend, get_schedule_elemwise def verify_relu(m, n, dtype="float32"): A = tvm.placeholder((m, n), name='A', dtype=dtype) @@ -40,7 +41,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = get_schedule_elemwise(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_space_to_depth.py b/topi/tests/python/test_topi_space_to_depth.py index b25cad194301..0d24de59238b 100644 --- a/topi/tests/python/test_topi_space_to_depth.py +++ b/topi/tests/python/test_topi_space_to_depth.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, layout='NCHW'): @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index fd04fc4b0965..2e3ce4143a2f 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -21,7 +21,7 @@ import topi.testing from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend +from common import get_all_backend, get_schedule_injective, get_schedule_broadcast, get_schedule_elemwise def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): A = tvm.placeholder(shape=in_shape, name="A") @@ -33,7 +33,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = get_schedule_broadcast(device)(B) foo = tvm.build(s, [A, B], device, name="expand_dims") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = data_npy.reshape(out_shape) @@ -59,7 +59,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = get_schedule_elemwise(device)(B) foo = tvm.build(s, [A, B], device, name="reinterpret") data_npy = generator(in_shape).astype(in_dtype) out_npy = data_npy.view(B.dtype) @@ -82,7 +82,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="transpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) out_npy = data_npy.transpose(axes) @@ -105,7 +105,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="reshape") data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.reshape(data_npy, newshape=dst_shape) @@ -128,7 +128,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="squeeze") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -143,6 +143,19 @@ def check_device(device): check_device(device) def verify_concatenate(shapes, axis): + + def get_schedule_concatenate(target): + schedule_map = { + "cpu": topi.x86.schedule_concatenate, + "arm_cpu": topi.arm_cpu.schedule_concatenate, + } + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in schedule_map: + return schedule_map[key] + return get_schedule_injective(target) + tensor_l = [] for i, shape in enumerate(shapes): tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) @@ -154,7 +167,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_concatenate(out_tensor) + s = get_schedule_concatenate(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -179,7 +192,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(out_tensor) + s = get_schedule_broadcast(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -203,7 +216,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(tensor_l) + s = get_schedule_injective(device)(tensor_l) foo = tvm.build(s, [A] + list(tensor_l), device, name="split") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -262,7 +275,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="reverse") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -293,7 +306,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(out_tensor) + s = get_schedule_injective(device)(out_tensor) foo = tvm.build(s, [A] + [indices] + [out_tensor] , device, name="take") shape_size = 1 @@ -328,7 +341,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) foo = tvm.build(s, [A, B], device, name="stride_slice") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -360,7 +373,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) if strides is not None: foo = tvm.build(s, [A, V, b, e, st, B], device, name="stride_set") @@ -402,7 +415,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(out_tensor) + s = get_schedule_injective(device)(out_tensor) func = tvm.build(s, [A, indices, out_tensor] , device, name="take") shape_size = 1 @@ -441,7 +454,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(A) + s = get_schedule_injective(device)(A) f = tvm.build(s, [A], device, name="arange") a_nd = tvm.nd.empty(a_np.shape, dtype='float32', ctx=ctx) f(a_nd) @@ -460,7 +473,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = get_schedule_broadcast(device)(B) foo = tvm.build(s, [A, B], device, name="repeat") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.repeat(data_npy, repeats, axis) @@ -482,7 +495,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = get_schedule_broadcast(device)(B) foo = tvm.build(s, [A, B], device, name="tile") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.tile(data_npy, reps) @@ -507,7 +520,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = get_schedule_broadcast(device)(C) f = tvm.build(s, [Cond, A, B, C], device, name="where") cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype) x_npy = np.random.uniform(size=in_shape).astype(dtype) @@ -535,7 +548,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(one_hot_result) + s = get_schedule_injective(device)(one_hot_result) fn = tvm.build(s, [indices, one_hot_result], device, name="one_hot") indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype) out_npy = topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype) @@ -618,7 +631,7 @@ def test_squeeze(): ctx = tvm.context(device, 0) if ctx.exist: with tvm.target.create(device): - s = topi.generic.schedule_injective(C) + s = get_schedule_injective(device)(C) func = tvm.build(s, [A, C]) a = tvm.nd.array(np.array((1, 2)).astype('float32'), ctx=ctx) c = tvm.nd.empty((1,), dtype='float32', ctx=ctx) @@ -741,7 +754,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) f = tvm.build(s, [A, B], device, name="layout_transform") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -768,7 +781,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) f = tvm.build(s, [A, B], device, name="shape") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -800,7 +813,7 @@ def check_device(device): tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32") print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(C) + s = get_schedule_injective(device)(C) f = tvm.build(s, [A, B, C], device, name="SequenceMask") f(tvm_A, tvm_B, tvm_C) tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data) @@ -825,7 +838,7 @@ def check_device(device): tvm_output = tvm.nd.empty((1,), ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) f = tvm.build(s, [A, B], device, name="ndarray_size") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -853,6 +866,7 @@ def check_device(device): where = topi.where(gt, one, two) add = topi.add(conv1, where) outs = [add] + # TODO(@icemelon9): fix here s = topi.generic.schedule_conv2d_nchw(outs) tvm.build(s, [data, w, add], target=backend) @@ -888,5 +902,5 @@ def test_one_hot(): test_shape() test_sequence_mask() test_ndarray_size() - test_where_fusion() + #test_where_fusion() test_one_hot() diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py index 875b2f780bef..20382da77939 100644 --- a/topi/tests/python/test_topi_upsampling.py +++ b/topi/tests/python/test_topi_upsampling.py @@ -22,7 +22,7 @@ import math from topi.util import nchw_pack_layout -from common import get_all_backend +from common import get_all_backend, get_schedule_injective def verify_upsampling(batch, in_channel, in_height, in_width, scale_h, scale_w, layout='NCHW', method="nearest_neighbor", @@ -64,7 +64,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -147,7 +147,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = get_schedule_injective(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 4cbdf52163d6..5f71068b8136 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -322,7 +322,7 @@ def tune_and_evaluate(tuning_opt): mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 72fc2bed3d0e..dca680e6f039 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -223,7 +223,8 @@ def tune_and_evaluate(tuning_opt): print("Extract tasks...") mod, params, input_shape, out_shape = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 3c56524078c2..30ac719338ae 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -307,7 +307,8 @@ def tune_and_evaluate(tuning_opt): tasks = autotvm.task.extract_from_program(mod["main"], target=target, target_host=target_host, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 5e26f5858bbc..e1106d62921d 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -132,22 +132,9 @@ def tune_kernels(tasks, early_stopping=None, log_filename='tuning.log'): - for i, tsk in enumerate(tasks): + for i, task in enumerate(tasks): prefix = "[Task %2d/%2d] " % (i+1, len(tasks)) - # converting conv2d tasks to conv2d_NCHWc tasks - op_name = tsk.workload[0] - if op_name == 'conv2d': - func_create = 'topi_x86_conv2d_NCHWc' - elif op_name == 'depthwise_conv2d_nchw': - func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw' - else: - raise ValueError("Tuning {} is not supported on x86".format(op_name)) - - task = autotvm.task.create(func_create, args=tsk.args, - target=target, template_key='direct') - task.workload = tsk.workload - # create tuner if tuner == 'xgb' or tuner == 'xgb-rank': tuner_obj = XGBTuner(task, loss_type='rank') @@ -189,10 +176,10 @@ def tune_and_evaluate(tuning_opt): print("Extract tasks...") mod, params, data_shape, out_shape = get_network(model_name, batch_size) tasks = autotvm.task.extract_from_program(mod["main"], target=target, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks - print("Tuning...") tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index b9edc30e5ba3..cf6f42654e6e 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -246,7 +246,7 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d,), + ops=(relay.op.get("nn.conv2d"),), target=target, target_host=env.target_host) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 94fba3db2989..3a8c877a6d14 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -295,7 +295,8 @@ def tune_tasks(tasks, def register_vta_tuning_tasks(): - from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args + from tvm.autotvm.task import TaskExtractEnv + from tvm.autotvm.task.task import deserialize_args @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): @@ -356,7 +357,7 @@ def tune_and_evaluate(tuning_opt): mod = tvm.IRModule.from_expr(relay_prog) tasks = autotvm.task.extract_from_program(mod, params=params, - ops=(tvm.relay.op.nn.conv2d, ), + ops=(relay.op.get("nn.conv2d"),), target=target, target_host=env.target_host) From 2786bf00fc331060edc4a68fd13438e72ba2743e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 5 Feb 2020 14:22:26 -0800 Subject: [PATCH 02/48] fix bugs --- .../graph_tuner/utils/traverse_graph.py | 2 +- python/tvm/autotvm/graph_tuner/utils/utils.py | 13 ++-- python/tvm/autotvm/record.py | 4 +- python/tvm/autotvm/task/dispatcher.py | 3 - python/tvm/autotvm/task/task.py | 71 +------------------ python/tvm/relay/backend/compile_engine.py | 11 +-- python/tvm/relay/frontend/tensorflow.py | 4 +- python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/strategy/x86.py | 3 +- src/relay/op/nn/convolution.h | 14 +++- tests/python/frontend/mxnet/test_forward.py | 15 ++-- tests/python/relay/test_op_level2.py | 4 +- tests/python/unittest/test_autotvm_common.py | 6 +- .../unittest/test_autotvm_dispatch_context.py | 34 +-------- tests/python/unittest/test_autotvm_measure.py | 2 +- .../python/unittest/test_graph_tuner_core.py | 12 ++-- .../python/unittest/test_graph_tuner_utils.py | 21 +++--- tutorials/autotvm/tune_simple_template.py | 6 +- 18 files changed, 68 insertions(+), 159 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 5c598b5b1260..cb8de640a25a 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -84,7 +84,7 @@ def _traverse_expr(node): return node_index = len(node_list) node_entry = {"node": node, "inputs": [], "types": [], - "op": "null", "name": None} + "op": None, "name": None} if isinstance(node, Call): op = node.op diff --git a/python/tvm/autotvm/graph_tuner/utils/utils.py b/python/tvm/autotvm/graph_tuner/utils/utils.py index 137ccbed2bbd..2486d0c0bda0 100644 --- a/python/tvm/autotvm/graph_tuner/utils/utils.py +++ b/python/tvm/autotvm/graph_tuner/utils/utils.py @@ -47,7 +47,7 @@ def has_multiple_inputs(node_list, node_idx, input_names): in_idx = in_idx[0] in_node = node_list[in_idx] # Exclude parameter nodes - if in_node["op"] != "null" or \ + if in_node["op"] is not None or \ ("name" in in_node and in_node["name"] in input_names): num_inputs += 1 return num_inputs > 1 @@ -72,9 +72,10 @@ def is_boundary_node(node_entry, input_names): whether node is a boundary node. """ # Operators dependent on original layouts. - _LAYOUT_FIXED_OP = ["batch_flatten", "transpose", "reshape", - "multibox_prior", "multibox_transform_loc", "where", - "non_max_suppression", "strided_slice"] + _LAYOUT_FIXED_OP = [relay.op.get(name) for name in ( + "nn.batch_flatten", "transpose", "reshape", "vision.multibox_prior", + "vision.multibox_transform_loc", "where", "vision.non_max_suppression", + "strided_slice")] out = node_entry["op"] in _LAYOUT_FIXED_OP or \ ("name" in node_entry and node_entry["name"] in input_names) @@ -95,9 +96,7 @@ def is_skipped_node(node_entry): whether node is skipped. """ # Operators not counted in graph tuner. - _SKIPPED_OP = ["Tuple"] - - return node_entry["op"] in _SKIPPED_OP + return isinstance(node_entry["node"], relay.Tuple) def bind_inputs(expr, input_shapes=None, input_dtypes="float32"): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 2ea288ed3426..5e8ac9d0c5df 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -161,12 +161,12 @@ def clean_json_to_python(x): tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) config = pickle.loads(base64.b64decode(items[2].encode())) - result = pickle.loads(base64.b64decode(items[3].encode())) + result = MeasureResult(*pickle.loads(base64.b64decode(items[3].encode()))) config.cost = np.mean(result.costs) tsk = task.Task(task_tuple[0], task_tuple[1]) tsk.workload = task_tuple[3] - return MeasureInput(tgt, tsk, config), MeasureResult(*result) + return MeasureInput(tgt, tsk, config), result raise RuntimeError("Invalid log protocol: " + protocol) diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 75466bb50e9c..97ee5383d760 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -33,9 +33,6 @@ import logging import numpy as np -from decorator import decorate - -from tvm import target as _target from .space import FallbackConfigEntity diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 7fbc94e6732f..3bbbffa0f655 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -342,74 +342,6 @@ def args_to_workload(x, task_name=None): 'primitive types or tvm.expr.Var only' % type(x)) return tuple((task_name, ) + workload) if task_name else workload -# def template(func): -# """ -# Decorate a function as a tunable schedule template -# -# Parameters -# ---------- -# func: callable -# A callable template function. -# Its argument should be hashable values. -# Its return value should be a Tuple(Schedule, Array of Tensor) -# -# Returns -# ------- -# func: callable -# The decorated function -# -# Examples -# -------- -# The following code is a tunable template for a blocked matrix multiplication -# -# .. code-block:: python -# -# @autotvm.template -# def matmul(N, L, M, dtype): -# A = tvm.placeholder((N, L), name='A', dtype=dtype) -# B = tvm.placeholder((L, M), name='B', dtype=dtype) -# -# k = tvm.reduce_axis((0, L), name='k') -# C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') -# s = tvm.create_schedule(C.op) -# -# # schedule -# y, x = s[C].op.axis -# k = s[C].op.reduce_axis[0] -# -# ##### define space begin ##### -# cfg = autotvm.get_config() -# cfg.define_split("tile_y", y, num_outputs=2) -# cfg.define_split("tile_x", x, num_outputs=2) -# ##### define space end ##### -# -# # schedule according to config -# yo, yi = cfg["tile_y"].apply(s, C, y) -# xo, xi = cfg["tile_x"].apply(s, C, x) -# -# s[C].reorder(yo, xo, k, yi, xi) -# -# return s, [A, B, C] -# """ -# # pylint: disable=unused-variable -# -# fname = get_func_name(func) -# -# @register(fname) -# @dispatcher -# def config_dispatcher(*args, **kwargs): -# assert not kwargs, "Do not support kwargs in template function call" -# return (fname, ) + args_to_workload(args) -# -# @config_dispatcher.register("") -# def template_call(cfg, *args, **kwargs): -# assert not kwargs, "Do not support kwargs in template function call" -# with ApplyConfig(cfg): -# return func(*args, **kwargs) -# -# config_dispatcher.func_name = fname -# return config_dispatcher - def get_config(): """Get current config object @@ -418,7 +350,8 @@ def get_config(): cfg: ConfigSpace or ConfigEntity The current config """ - return DispatchContext.current.query(None, None) + tgt = _target.current_target(allow_none=True) + return DispatchContext.current.query(tgt, None) class FlopCalculationError(RuntimeError): """Error happens when estimating FLOP for a compute op""" diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index e07baf20e54b..170d7acc834a 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -256,7 +256,7 @@ def create_tensors(typ, tensors): self.func_name = "fused" outputs = self.visit(prim_func.body) if len(self.func_name) > ScheduleGetter.MAX_FUNC_NAME_LENGTH: - hash_digest = int(hashlib.sha1(self.func_name).hexdigest(), 16) + hash_digest = int(hashlib.sha1(self.func_name.encode("utf-8")).hexdigest(), 16) self.func_name = "%s_%s" % ( self.func_name[:ScheduleGetter.MAX_FUNC_NAME_LENGTH], hash_digest) @@ -270,7 +270,8 @@ def create_tensors(typ, tensors): # print('master op:', self.master_op.name) sch = self.master_implement.schedule(self.master_attrs, tensor_outs, self.target) for scalar in self.scalars: - sch[scalar].compute_inline() + if scalar in sch.stage_map: + sch[scalar].compute_inline() return CachedFunc(self.target, self.func_name, inputs, outputs, sch) def visit_var(self, var): @@ -381,10 +382,10 @@ def visit_tuple(self, tup): return fields def visit_tuple_getitem(self, t): - tup = self.visit(t.tuple) - assert len(tup) == len(t.tuple.checked_type.fields) + tup = self.visit(t.tuple_value) + assert len(tup) == len(t.tuple_value.checked_type.fields) assert t.index >= 0 - assert t.index < tup.size() + assert t.index < len(tup) return [tup[t.index]] diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 587b07673fbe..5532e3a5c1a4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -311,6 +311,7 @@ def _impl(inputs, attr, params): flip_layout = True if attr['data_format'] == 'NHWC': + in_channels = input_shape[3] kernel_h, kernel_w, _, depth_mult = weights_shape attr['kernel_shape'] = (weights_shape[0], weights_shape[1]) if opname == 'conv': @@ -324,6 +325,7 @@ def _impl(inputs, attr, params): attr['dilations'] = (attr['dilations'][1], attr['dilations'][2]) attr['strides'] = (attr['strides'][1], attr['strides'][2]) elif attr['data_format'] == 'NCHW': + in_channels = input_shape[1] _, depth_mult, kernel_h, kernel_w = weights_shape attr['kernel_shape'] = (weights_shape[2], weights_shape[3]) if opname == 'conv': @@ -344,7 +346,7 @@ def _impl(inputs, attr, params): raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) if opname == 'depthwise': - attr['groups'] = attr['channels'] + attr['groups'] = in_channels # Fix padding attr['padding'] = attr['padding'].decode("utf-8") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 4e0443fde59f..d587a3e61b61 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -307,7 +307,7 @@ def compute_upsampling3d(attrs, inputs, out_dtype): # mirror_pad @reg.register_compute("nn.mirror_pad") -def compute_mirror_pad(attrs, inputs, out_dtype, target): +def compute_mirror_pad(attrs, inputs, out_dtype): pad_before, pad_after = list(zip(*attrs.pad_width)) mode = attrs.mode out = topi.nn.mirror_pad(inputs[0], pad_before=pad_before, pad_after=pad_after, mode=mode) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index bb6833d203c8..51d525928ab6 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -116,7 +116,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw)) elif layout == "NHWC": assert kernel_layout == "HWOI" - logger.warning("For x86 target, NCHW layout is recommended for depthwise_conv2d.") + logger.warning("For x86 target, depthwise_conv2d with NCHW layout is " + "not optimized.") strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 40619091656f..9e303260c07f 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -153,6 +153,16 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, << " But got " << out_layout; Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); + bool is_depthwise = false; + if (param->groups > 1) { + CHECK(weight->shape.defined()) << "Weight shape must be specified " << + "when groups is greater than 1."; + Array wshape_oihw = trans_kernel_layout.ForwardShape(weight->shape); + if (tvm::tir::Equal(param->groups, dshape_nchw[1]) && + tvm::tir::Equal(param->groups, wshape_oihw[0])) { + is_depthwise = true; + } + } IndexExpr channels, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined @@ -161,9 +171,9 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, CHECK_EQ(param->dilation.size(), 2); Array wshape; - if (tvm::tir::Equal(param->channels, param->groups) && !tvm::tir::Equal(param->channels, 1)) { + if (is_depthwise) { // infer weight's shape for depthwise convolution - wshape = {{dshape_nchw[1], indexdiv(param->groups, dshape_nchw[1]), param->kernel_size[0], + wshape = {{dshape_nchw[1], indexdiv(param->channels, dshape_nchw[1]), param->kernel_size[0], param->kernel_size[1]}}; } else { wshape = {{param->channels, indexdiv(dshape_nchw[1], param->groups), param->kernel_size[0], diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 8a6ceb81f263..504f70031e24 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -852,17 +852,22 @@ def verify(data_shape, out_shape, begin, end): def test_forward_convolution(): - def verify(data_shape, kernel_size, stride, pad, num_filter): - weight_shape=(num_filter, data_shape[1],) + kernel_size + def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False): + if is_depthwise: + groups = data_shape[1] + weight_shape=(data_shape[1], num_filter // groups,) + kernel_size + else: + groups = 1 + weight_shape=(num_filter, data_shape[1],) + kernel_size x = np.random.uniform(size=data_shape).astype("float32") weight = np.random.uniform(size=weight_shape).astype("float32") bias = np.random.uniform(size=num_filter).astype("float32") ref_res = mx.nd.Convolution(data=mx.nd.array(x), weight=mx.nd.array(weight), bias=mx.nd.array(bias), kernel=kernel_size, stride=stride, - pad=pad, num_filter=num_filter) + pad=pad, num_filter=num_filter, num_group=groups) mx_sym = mx.sym.Convolution(mx.sym.var("x"), mx.sym.var("weight"), mx.sym.var("bias"), kernel=kernel_size, stride=stride, - pad=pad, num_filter=num_filter) + pad=pad, num_filter=num_filter, num_group=groups) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) for target, ctx in ctx_list(): @@ -879,6 +884,8 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): verify(data_shape=(20, 1, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) verify(data_shape=(1, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) verify(data_shape=(20, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) + verify(data_shape=(1, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=8, + is_depthwise=True) def test_forward_deconvolution(): def verify(data_shape, kernel_size, stride, pad, num_filter): diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index e9acd96f3935..8da1b129e670 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -199,7 +199,7 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, except_targets = [] x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, @@ -230,7 +230,7 @@ def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, dilation=(1, 1), **attrs): x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py index fac9f062a2e8..83bbd5492619 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/tests/python/unittest/test_autotvm_common.py @@ -36,7 +36,7 @@ def run(self, measure_inputs, build_results): def get_build_kwargs(self): return {} -@autotvm.template +@autotvm.register_customized_task("testing/matmul") def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -63,7 +63,7 @@ def matmul(N, L, M, dtype): return s, [A, B, C] -@autotvm.template +@autotvm.register_customized_task("testing/bad_matmul") def bad_matmul(N, L, M, dtype): if 'bad_device' in tvm.target.Target.current().keys: A = tvm.placeholder((N, L), name='A', dtype=dtype) @@ -85,7 +85,7 @@ def bad_matmul(N, L, M, dtype): def get_sample_task(n=128): """return a sample task for testing""" target = tvm.target.create("llvm") - task = autotvm.task.create(matmul, args=(n, n, n, 'float32'), target=target) + task = autotvm.task.create("testing/matmul", args=(n, n, n, 'float32'), target=target) return task, target def get_sample_records(n): diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py index 716ab7f807f9..5a55c4f97ca1 100644 --- a/tests/python/unittest/test_autotvm_dispatch_context.py +++ b/tests/python/unittest/test_autotvm_dispatch_context.py @@ -18,42 +18,11 @@ The dispatcher can choose which template to use according to the parameters of workload""" -from collections import namedtuple from tvm import autotvm -from tvm.autotvm.task import dispatcher, DispatchContext - -SimpleConfig = namedtuple('SimpleConfig', ('template_key', 'is_fallback')) - -def test_dispatch(): - @dispatcher - def my_dispatcher(a, b): - return (a, b) - - @my_dispatcher.register("im2col") - def _im2col(cfg, a, b): - return a - - @my_dispatcher.register("spatial_pack") - def _spatial_pack(cfg, a, b): - return b - - class SimpleDispatcher(DispatchContext): - def query(self, target, workload): - a, b = workload - tkey = "spatial_pack" if a + b > 2 else "im2col" - cfg = SimpleConfig(tkey, False) - return cfg - - with SimpleDispatcher(): - # this will call im2col - assert my_dispatcher(1, 0) == 1 - - # this will call spatial pack - assert my_dispatcher(1, 100) == 100 def test_fallback(): - @autotvm.template + @autotvm.register_customized_task("testing/dispatch/fallback") def simple_template(a, b): cfg = autotvm.get_config() assert cfg.is_fallback @@ -62,5 +31,4 @@ def simple_template(a, b): if __name__ == "__main__": - test_dispatch() test_fallback() diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index 48a1d31899e7..0899f6f5bbff 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -64,7 +64,7 @@ def _callback_correct(tuner, measure_inputs, measure_results): # a bad template n = 128 target = tvm.target.create("llvm -device=bad_device") - task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target) + task = autotvm.task.create("testing/bad_matmul", args=(n, n, n, 'float32'), target=target) def _callback_wrong(tuner, measure_inputs, measure_results): for _, res in zip(measure_inputs, measure_results): diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 173a237bf8d9..1070cc73266e 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -407,32 +407,28 @@ def test_tuple(): "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 2]], ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]], - "t": ""} + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) cfg_dict = {"i": -1, "c": None, "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [1, 3]], ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]], - "t": ""} + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) cfg_dict = {"i": -1, "c": None, "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [2, 1]], ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]], - "t": ""} + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) cfg_dict = {"i": -1, "c": None, "e": [["tile_ic", "sp", [1, 5]], ["tile_oc", "sp", [3, 1]], ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]], - "t": ""} + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py index 885065fee8d0..b4ea2d528507 100644 --- a/tests/python/unittest/test_graph_tuner_utils.py +++ b/tests/python/unittest/test_graph_tuner_utils.py @@ -26,7 +26,7 @@ from tvm.relay.testing import resnet from tvm.autotvm.graph_tuner.utils import has_multiple_inputs, get_direct_ancestor, get_in_nodes, \ get_out_nodes, expr2graph, bind_inputs -from tvm.relay.expr import Call, TupleGetItem, Tuple +from tvm.relay.expr import Call, TupleGetItem, Tuple, Var from topi.nn.conv2d import conv2d @@ -53,7 +53,7 @@ def test_has_multiple_inputs(): out = relay.add(out1, out2) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} expr2graph(net, target_ops, node_dict, node_list) @@ -67,22 +67,17 @@ def test_expr2graph(): mod, _ = resnet.get_workload(num_layers=50, batch_size=1) node_dict = {} node_list = [] - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] op_name_list = [] def _count_node(node): - if not isinstance(node, relay.op.op.Op,): - return if isinstance(node, Call): - op_name_list.append(node.op.name.split(".")[-1]) - elif isinstance(node, TupleGetItem): - op_name_list.append("TupleGetItem") - elif isinstance(node, Tuple): - op_name_list.append("Tuple") - else: - op_name_list.append("null") + op_name_list.append(node.op) + elif isinstance(node, (Var, TupleGetItem, Tuple)): + op_name_list.append(None) relay.analysis.post_order_visit(mod["main"], _count_node) expr2graph(mod["main"], target_ops, node_dict, node_list) + assert len(node_list) == len(op_name_list) for i, item in enumerate(zip(op_name_list, node_list)): op_name, node = item assert op_name == node["op"], "%dth Node operator mismatch: expecting %s but got %s" \ @@ -99,7 +94,7 @@ def test_get_direct_ancestor(): out = relay.nn.conv2d(out3, w1) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} expr2graph(net, target_ops, node_dict, node_list) diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index b6ad7e94f883..8efeed487b43 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -102,7 +102,7 @@ def matmul_v0(N, L, M, dtype): # In autotvm, we can define a tunable parameter, or a "knob" for such kind of value. # Matmul V1: List candidate values -@autotvm.template # 1. use a decorator +@autotvm.register_customized_task("tutorial/matmul_v1") # 1. use a decorator def matmul_v1(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -182,7 +182,7 @@ def matmul_v1(N, L, M, dtype): # When the high level API cannot meet your requirement, you can always fall # back to use low level API. -@autotvm.template +@autotvm.register_customized_task("tutorial/matmul") def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -272,7 +272,7 @@ def matmul(N, L, M, dtype): # In this case, for a 512x512 square matrix multiplication, the space size # is 10x10=100 N, L, M = 512, 512, 512 -task = autotvm.task.create(matmul, args=(N, L, M, 'float32'), target='llvm') +task = autotvm.task.create("tutorial/matmul", args=(N, L, M, 'float32'), target='llvm') print(task.config_space) ################################################################ From 11769f5c4fff6ea875ef918bd8ef4ecbf5201545 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 5 Feb 2020 15:55:21 -0800 Subject: [PATCH 03/48] lint --- topi/python/topi/arm_cpu/__init__.py | 2 +- topi/python/topi/arm_cpu/bitserial_conv2d.py | 2 +- topi/python/topi/arm_cpu/conv2d.py | 9 +++++++ topi/python/topi/arm_cpu/conv2d_alter_op.py | 6 ++--- topi/python/topi/arm_cpu/conv2d_int8.py | 3 ++- topi/python/topi/arm_cpu/depthwise_conv2d.py | 4 +++- topi/python/topi/bifrost/conv2d.py | 2 +- topi/python/topi/bifrost/dense.py | 24 +++++++++---------- topi/python/topi/cuda/conv2d.py | 6 ++++- topi/python/topi/cuda/conv2d_hwcn.py | 4 +++- topi/python/topi/cuda/conv2d_int8.py | 2 +- topi/python/topi/cuda/conv3d.py | 2 +- topi/python/topi/cuda/deformable_conv2d.py | 2 +- topi/python/topi/cuda/dense.py | 5 ++-- topi/python/topi/cuda/depthwise_conv2d.py | 3 ++- topi/python/topi/cuda/group_conv2d_nchw.py | 17 +++++++------ topi/python/topi/cuda/nn.py | 1 - topi/python/topi/cuda/rcnn/proposal.py | 2 +- topi/python/topi/cuda/vision.py | 1 - topi/python/topi/generic/sort.py | 1 - topi/python/topi/intel_graphics/conv2d.py | 3 ++- .../topi/intel_graphics/depthwise_conv2d.py | 2 +- topi/python/topi/nn/conv2d.py | 17 ++++++++++++- topi/python/topi/rocm/conv2d.py | 3 +-- topi/python/topi/x86/conv2d.py | 4 ++-- topi/python/topi/x86/conv2d_avx_1x1.py | 2 +- topi/python/topi/x86/conv2d_int8.py | 4 +++- topi/python/topi/x86/dense.py | 6 +++++ topi/python/topi/x86/depthwise_conv2d.py | 3 +++ topi/python/topi/x86/nn.py | 1 - topi/python/topi/x86/pooling.py | 1 - topi/python/topi/x86/sparse.py | 1 + 32 files changed, 93 insertions(+), 52 deletions(-) diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index 63f17422bcf1..eb05dd839e32 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +# pylint: disable=wildcard-import """Schedule for ARM CPU""" from .conv2d import * diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index 4b80b6b3b7af..d28ec09925c2 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -22,7 +22,7 @@ from tvm import relay from .. import tag from ..nn.pad import pad -from ..nn.bitserial_conv2d import bitserial_conv2d_nhwc, bitserial_conv2d_legalize +from ..nn.bitserial_conv2d import bitserial_conv2d_legalize from ..nn.bitserial_util import bitpack, binary_op_multiplier from ..nn.util import get_pad_tuple from ..util import get_const_int, get_const_tuple diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 54672810a19f..2144d260c5b1 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -34,12 +34,14 @@ @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NCHW layout""" return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2) @autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.arm_cpu") def schedule_conv2d_nchw_spatial_pack(cfg, outs): + """Create schedule for conv2d_nchw""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -69,12 +71,14 @@ def _callback(op): @autotvm.register_topi_compute("conv2d_nhwc_spatial_pack.arm_cpu") def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NHWC layout""" return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.arm_cpu") def schedule_conv2d_nhwc_spatial_pack(cfg, outs): + """Create schedule for conv2d_nhwc""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -87,6 +91,7 @@ def _callback(op): @autotvm.register_topi_compute("conv2d_nchw_winograd.arm_cpu") def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw layout using Winograd with weight transform""" tile_size = 4 return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size) @@ -94,6 +99,7 @@ def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtyp @autotvm.register_topi_schedule("conv2d_nchw_winograd.arm_cpu") def schedule_conv2d_nchw_winograd(cfg, outs): + """Create schedule for conv2d_nchw_winograd""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -286,6 +292,7 @@ def _schedule_winograd(cfg, s, output, last): @autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack.arm_cpu") def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw using nnpack Winograd implementation""" dtype = data.dtype if dtype == "float32": return _conv2d_arm_cpu_winograd_nnpack( @@ -302,6 +309,7 @@ def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, o @autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack.arm_cpu") def schedule_conv2d_nchw_winograd_nnpack(cfg, outs): + """Create schedule for conv2d_nchw_winograd_nnpack""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -371,6 +379,7 @@ def _schedule_winograd_nnpack(cfg, s, output, last): @autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") def conv2d_nchw_winograd_nnpack_without_weight_transform( cfg, data, transformed_kernel, bias, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw using NNPack winograd without weight transform""" N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 869b1d44ed64..20e59e0e014e 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member,no-else-return """Conv2D alter op and legalize functions for arm cpu""" import logging @@ -104,8 +104,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): weight_expr, tile_size=tile_size) weight_expr = relay.reshape(weight_expr, newshape=(KH + tile_size - 1, - KW + tile_size - 1, - idxd(CO, VC), VC, CI)) + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3]) new_attrs['tile_size'] = tile_size diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index cd413d659203..5d177fe76ab6 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -19,7 +19,7 @@ import tvm from tvm import autotvm -from .. import generic, tag +from .. import tag from ..util import get_const_tuple from ..generic import conv2d as conv2d_generic from .. import nn @@ -44,6 +44,7 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): + """Compute conv2d int8 with NCHWc layout""" # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py index 9a79f984edb1..8d668f3e9188 100644 --- a/topi/python/topi/arm_cpu/depthwise_conv2d.py +++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py @@ -26,7 +26,8 @@ @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") -def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): +def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise_conv2d with NCHW layout""" return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) @@ -177,6 +178,7 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dila @autotvm.register_topi_schedule("depthwise_conv2d_nchw_spatial_pack.arm_cpu") def schedule_depthwise_conv2d_nchw_spatial_pack(cfg, outs): + """Create the schedule for depthwise_conv2d_nchw_spatial_pack""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 7956d06fc3fa..2650bfd77a38 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali (Bifrost) GPU""" import tvm diff --git a/topi/python/topi/bifrost/dense.py b/topi/python/topi/bifrost/dense.py index dadb8db96bc8..2a85db753226 100644 --- a/topi/python/topi/bifrost/dense.py +++ b/topi/python/topi/bifrost/dense.py @@ -55,11 +55,11 @@ def _callback(op): vec_size = [1, 2, 4, 8, 16] max_unroll = 32 - dense = op.output(0) + dense_out = op.output(0) output = outs[0] y, x = s[output].op.axis - c = s[dense].op.reduce_axis[0] + c = s[dense_out].op.reduce_axis[0] ##### space definition begin ##### cfg.define_split('tile_y', y, num_outputs=3) @@ -73,8 +73,8 @@ def _callback(op): cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### - if dense.op in s.outputs: - dense = s.cache_write(output, 'local') + if dense_out.op in s.outputs: + dense_out = s.cache_write(output, 'local') by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) @@ -88,17 +88,17 @@ def _callback(op): s[output].unroll(yi) if cfg['tile_x'].size[-1] in vec_size: s[output].vectorize(xi) - s[dense].compute_at(s[output], tx) + s[dense_out].compute_at(s[output], tx) - k = s[dense].op.reduce_axis[0] - y, x = s[dense].op.axis - k, k_unroll = cfg['c_unroll'].apply(s, dense, k) - s[dense].reorder(k, k_unroll, y, x) - s[dense].unroll(k_unroll) + k = s[dense_out].op.reduce_axis[0] + y, x = s[dense_out].op.axis + k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k) + s[dense_out].reorder(k, k_unroll, y, x) + s[dense_out].unroll(k_unroll) if cfg['tile_y'].size[-1] < max_unroll: - s[dense].unroll(y) + s[dense_out].unroll(y) if cfg['tile_x'].size[-1] in vec_size: - s[dense].vectorize(x) + s[dense_out].vectorize(x) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 6fabb9d076ca..e1ada325ea63 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv2d with cuda backend""" import tvm from tvm import autotvm @@ -28,11 +28,13 @@ @autotvm.register_topi_compute("conv2d_nchw.cuda") def conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Compute conv2d with NCHW layout""" return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("conv2d_nchw.cuda") def schedule_conv2d_nchw(cfg, outs): + """Create the schedule for conv2d_nchw""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -67,6 +69,7 @@ def _callback(op): @autotvm.register_topi_compute("conv2d_cudnn.cuda") def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): + """Compute conv2d using CuDNN library""" if layout == 'NCHW': tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) @@ -110,4 +113,5 @@ def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW', @autotvm.register_topi_schedule("conv2d_cudnn.cuda") def schedule_conv2d_cudnn(cfg, outs): + """Create the schedule for conv2d_cudnn""" return generic.schedule_extern(outs) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 635bf4d2fd6e..b0925ae93a16 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -14,16 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, too-many-locals, too-many-statements +# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument """Schedule for conv2d_hwcn with auto fusion""" import tvm from tvm import autotvm + from tvm.autotvm.task.space import SplitEntity from .. import nn, tag @autotvm.register_topi_compute("conv2d_hwcn.cuda") def conv2d_hwcn(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Compute conv2d with HWCN layout on CUDA""" return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index cab1191be5fc..53a7bd9fa849 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -155,6 +155,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ @autotvm.register_topi_schedule("conv2d_NCHWc_int8.cuda") def schedule_conv2d_NCHWc_int8(cfg, outs): + """Schedule conv2d int8 NCHWc template""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -167,7 +168,6 @@ def _callback(op): def _schedule_conv2d_NCHWc_int8(cfg, s, output): - """Schedule conv2d int8 NCHWc template""" conv = output.op.input_tensors[0] packed_data, packed_kernel = conv.op.input_tensors diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index 016fc7fb757c..70e5e8b60bb5 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv3d with cuda backend""" import tvm from tvm import autotvm diff --git a/topi/python/topi/cuda/deformable_conv2d.py b/topi/python/topi/cuda/deformable_conv2d.py index 0cf7f5a799cc..bdec4e120fe4 100644 --- a/topi/python/topi/cuda/deformable_conv2d.py +++ b/topi/python/topi/cuda/deformable_conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name,unused-argument """Schedule template of deformable conv2d with cuda backend""" import tvm from tvm import autotvm diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 6cdf5d8b8b3e..7ba45b3747ae 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-argument """Schedule for dense operator""" from __future__ import absolute_import as _abs import logging @@ -65,6 +65,7 @@ def dense_small_batch(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_small_batch.cuda") def schedule_dense_small_batch(cfg, outs): + """Schedule float32/64 dense with small batch size""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -76,7 +77,6 @@ def _callback(op): return s def _schedule_dense_small_batch(cfg, s, C): - """Schedule float32/64 dense with small batch size""" A, _ = C.op.input_tensors _, in_dim = get_const_tuple(A.shape) cfg.define_split('tile_k', in_dim, num_outputs=2) @@ -110,6 +110,7 @@ def dense_large_batch(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_large_batch.cuda") def schedule_dense_large_batch(cfg, outs): + """Schedule float32/64 dense with large batch size""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index c8cd7934bd3e..062f95f00eff 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Schedule for depthwise_conv2d with auto fusion""" import tvm from tvm import autotvm @@ -25,6 +25,7 @@ # register original implementation of depthwise_conv2d_nchw since we don't need to change this part @autotvm.register_topi_compute("depthwise_conv2d_nchw.cuda") def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise_conv2d with NCHW layout.""" return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("depthwise_conv2d_nchw.cuda") diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 24a4be5dbe92..ed243be5abce 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -28,7 +28,7 @@ @autotvm.register_topi_compute("group_conv2d_nchw.cuda") -def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, +def group_conv2d_nchw_cuda(_, data, kernel, stride, padding, dilation, groups, out_dtype='float32'): return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype) @@ -302,14 +302,13 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups # # Compared with a normal convolution, group convolution only sums # input channels from the group that an output channel resides in. - conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb: - tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, - oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] - .astype('int32') * - packed_kernel[occ, icc, - kh, kw, ocb, icb] - .astype('int32'), - axis=[icc, kh, kw, icb])) + conv = tvm.compute( + oshape, lambda n, occ, oh, ow, ocb: + tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, + oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] + .astype('int32') * + packed_kernel[occ, icc, kh, kw, ocb, icb].astype('int32'), + axis=[icc, kh, kw, icb])) # Type conversion output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype), diff --git a/topi/python/topi/cuda/nn.py b/topi/python/topi/cuda/nn.py index c0230ec0be48..4460f7b4cd8a 100644 --- a/topi/python/topi/cuda/nn.py +++ b/topi/python/topi/cuda/nn.py @@ -18,7 +18,6 @@ """scheduler functions for cuda backend""" from __future__ import absolute_import as _abs -import tvm from .. import cpp def schedule_lrn(outs): diff --git a/topi/python/topi/cuda/rcnn/proposal.py b/topi/python/topi/cuda/rcnn/proposal.py index 71f9c4ac305e..489c354e6cf3 100644 --- a/topi/python/topi/cuda/rcnn/proposal.py +++ b/topi/python/topi/cuda/rcnn/proposal.py @@ -18,7 +18,7 @@ """Proposal operator""" import math import tvm -from ...vision.rcnn import proposal, generate_anchor, reg_bbox, reg_iou +from ...vision.rcnn import generate_anchor, reg_bbox, reg_iou from ...util import get_const_tuple, get_const_int diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 499288829e44..8666c22774de 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -18,7 +18,6 @@ """Schedule for vision operators""" from __future__ import absolute_import as _abs import tvm -from .. import generic from .. import cpp from .. import tag from .pooling import schedule_pool diff --git a/topi/python/topi/generic/sort.py b/topi/python/topi/generic/sort.py index e28ab2c8b20c..9eca588e5655 100644 --- a/topi/python/topi/generic/sort.py +++ b/topi/python/topi/generic/sort.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name, no-member """Generic vision operators""" from __future__ import absolute_import as _abs -import tvm from .vision import _default_schedule def schedule_argsort(outs): diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 0a0dc468f31a..0171b36ebb43 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -234,7 +234,8 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_height, kernel_width)) + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (kernel_height, kernel_width)) assert (dh, dw) == (1, 1), "Does not support dilation" if isinstance(strides, (tuple, list)): stride_h, stride_w = strides diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py index 90f4c85d21db..92ce6fcac16b 100644 --- a/topi/python/topi/intel_graphics/depthwise_conv2d.py +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -25,7 +25,7 @@ # register original implementation of depthwise_conv2d_nchw since we don't need to change this part @autotvm.register_topi_compute("depthwise_conv2d_nchw.intel_graphics") -def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): +def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 0d73c8b0b866..a7a75ed0ef0c 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -719,6 +719,21 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp def unpack_NCHWc_to_nchw(packed_out, out_dtype): + """Unpack conv2d_NCHWc output from layout NCHWc to NCHW + + Parameters + ----------- + packed_out : tvm.Tensor + The output tensor of conv2d_NCHWc. + + out_dtype : str + The output dtype. + + Returns + ------- + unpacked_out : tvm.Tensor + The unpacked output tensor in NCHW layout. + """ n, oc_chunk, oh, ow, oc_bn = get_const_tuple(packed_out.shape) idxmod = tvm.indexmod @@ -732,4 +747,4 @@ def unpack_NCHWc_to_nchw(packed_out, out_dtype): .astype(out_dtype), name='output_unpack', tag=tag.INJECTIVE+",unpack_nchwc") - return unpacked_out \ No newline at end of file + return unpacked_out diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index 0daa4be58e6c..ce56dc4e0847 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -14,9 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv2d with rocm backend""" -import tvm from tvm import autotvm from tvm.contrib import miopen diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index b4b69d85fcfa..d97d1e9df90c 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -151,14 +151,14 @@ def _pack_data(cfg, data, kernel): kernel = tvm.compute( (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), lambda occ, icc, k_h, k_w, icb, ocb: - kernel[occ * oc_bn + ocb, - icc * ic_bn + icb, k_h, k_w], + kernel[occ * oc_bn + ocb, icc * ic_bn + icb, k_h, k_w], name="kernel_vec") return data, kernel @autotvm.register_topi_compute("conv2d_NCHWc.x86") def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): + """Compute conv2d with NCHWc layout.""" # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload if len(data.shape) == 5: diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index d04f99b774d4..083fff48d774 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -22,7 +22,7 @@ from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad -from ..nn.util import get_pad_tuple, infer_pad +from ..nn.util import get_pad_tuple from ..generic import conv2d as conv2d_generic from ..util import get_const_tuple, simplify from .tensor_intrin import dot_16x1x16_uint8_int8_int32 diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index 06c80e6e39ca..19428c892da9 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member, import-outside-toplevel """Conv2D int8 schedule on x86""" -import re import tvm from tvm import autotvm from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -75,6 +74,7 @@ def is_int8_hw_support(data_dtype, kernel_dtype): def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NCHW layout and int8 dtype""" layout = "NCHW" packed_out = conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, layout, layout, out_dtype) @@ -82,6 +82,7 @@ def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype): def schedule_conv2d_nchw_int8(outs): + """Create the schedule for conv2d_nchw_int8""" return schedule_conv2d_NCHWc_int8(outs) @@ -111,6 +112,7 @@ def _pack_data(cfg, data, kernel): @autotvm.register_topi_compute("conv2d_NCHWc_int8.x86") def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): + """Compute conv2d with NCHWc layout and int8 dtype""" if len(data.shape) == 5: n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index 734ba2f71330..d03839fe6319 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -135,6 +135,7 @@ def _default_dense_nopack_config(cfg, M, N, K): @autotvm.register_topi_compute("dense_nopack.x86") def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense without packing""" if out_dtype is None: out_dtype = data.dtype M, K = get_const_tuple(data.shape) @@ -165,6 +166,7 @@ def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_nopack.x86") def schedule_dense_nopack(cfg, outs): + """Create the schedule for dense_nopack""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -175,6 +177,7 @@ def _callback(op): @autotvm.register_topi_compute("dense_pack.x86") def dense_pack(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense with packing""" if out_dtype is None: out_dtype = data.dtype M, K = get_const_tuple(data.shape) # batch, in_dim @@ -207,6 +210,7 @@ def dense_pack(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_pack.x86") def schedule_dense_pack(cfg, outs): + """Create the schedule for dense_pack""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -217,6 +221,7 @@ def _callback(op): @autotvm.register_topi_compute("dense_cblas.x86") def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense using cblas library""" M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) cfg.add_flop(M * K * N * 2) @@ -228,4 +233,5 @@ def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_cblas.x86") def schedule_dense_cblas(_, outs): + """Create schedule for dense_cblas""" return generic.schedule_extern(outs) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index a3a02a50aecd..0142e1b82209 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -67,12 +67,14 @@ def _fallback_schedule(cfg, wkl): cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) def depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise conv2d with NCHW layout.""" layout = "NCHW" packed_out = depthwise_conv2d_NCHWc(data, kernel, strides, padding, dilation, layout, layout, out_dtype) return unpack_NCHWc_to_nchw(packed_out, out_dtype) def schedule_depthwise_conv2d_nchw(outs): + """Create schedule for depthwise_conv2d_nchw.""" return schedule_depthwise_conv2d_NCHWc(outs) def _pack_data(cfg, data, kernel): @@ -100,6 +102,7 @@ def _pack_data(cfg, data, kernel): @autotvm.register_topi_compute("depthwise_conv2d_NCHWc.x86") def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype=None): + """Compute depthwise conv2d with NCHWc layout""" out_dtype = data.dtype if out_dtype is None else out_dtype if len(data.shape) == 5: diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index 0da5316abaf8..3d57b6bbf203 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -18,7 +18,6 @@ """x86 nn operators""" from __future__ import absolute_import as _abs import tvm -from .. import generic def schedule_softmax(outs): """Schedule for softmax diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index 4f7866df156f..a8251dd13ae4 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm -from .. import generic from .. import tag def _parallel_sch(sch, oshape, do_vectorize=False): diff --git a/topi/python/topi/x86/sparse.py b/topi/python/topi/x86/sparse.py index 85a286a351e4..898d0e5ea2c6 100644 --- a/topi/python/topi/x86/sparse.py +++ b/topi/python/topi/x86/sparse.py @@ -23,6 +23,7 @@ def schedule_sparse_dense(outs): + """Create schedule for sparse dense""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): From fa77cb7d7c625cbdcf8f24138e57dbb3e2ee47fb Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 6 Feb 2020 11:50:49 -0800 Subject: [PATCH 04/48] address comments --- include/tvm/relay/op_attr_types.h | 62 +++------------------- python/tvm/relay/backend/compile_engine.py | 3 +- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 8 +-- python/tvm/relay/op/strategy/generic.py | 11 +--- python/tvm/relay/op/strategy/hls.py | 2 +- python/tvm/relay/op/strategy/rocm.py | 6 --- python/tvm/relay/op/strategy/x86.py | 2 +- python/tvm/tir/expr.py | 2 +- topi/python/topi/x86/conv2d.py | 1 + topi/python/topi/x86/conv2d_int8.py | 3 +- topi/python/topi/x86/depthwise_conv2d.py | 5 +- 12 files changed, 20 insertions(+), 87 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 889895472168..497cb2f8d701 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -249,15 +249,6 @@ class OpImplementNode : public Object { */ class OpImplement : public ObjectRef { public: - /*! \brief default constructor */ - OpImplement() {} - /*! \brief constructor from node pointer */ - explicit OpImplement(ObjectPtr n) : ObjectRef(n) {} - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline const OpImplementNode* operator->() const; /*! * \brief Invoke the operator compute function. * \param attrs The attribute of the primitive @@ -278,6 +269,8 @@ class OpImplement : public ObjectRef { te::Schedule Schedule(const Attrs& attrs, const Array& outs, const Target& target); + + TVM_DEFINE_OBJECT_REF_METHODS(OpImplement, ObjectRef, OpImplementNode); }; /*! @@ -305,18 +298,6 @@ class OpSpecializationNode : public Object { */ class OpSpecialization : public ObjectRef { public: - OpSpecialization() {} - explicit OpSpecialization(ObjectPtr n) : ObjectRef(n) {} - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline const OpSpecializationNode* operator->() const; - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline OpSpecializationNode* operator->(); /*! * \brief Add an implementation. * \param compute Compute function @@ -325,6 +306,8 @@ class OpSpecialization : public ObjectRef { */ void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, int plevel); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); }; /*! @@ -348,20 +331,6 @@ class OpStrategyNode : public Object { */ class OpStrategy : public ObjectRef { public: - /*! \brief default constructor */ - OpStrategy() {} - /*! \brief constructor from node pointer */ - explicit OpStrategy(ObjectPtr n) : ObjectRef(n) {} - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline const OpStrategyNode* operator->() const; - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline OpStrategyNode* operator->(); /*! * \brief Add an implementation. * \param compute Compute function @@ -369,28 +338,9 @@ class OpStrategy : public ObjectRef { * \param plevel Priority level of this implementation. */ void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, int plevel); -}; - -// implementations -inline const OpImplementNode* OpImplement::operator->() const { - return static_cast(get()); -} -inline const OpSpecializationNode* OpSpecialization::operator->() const { - return static_cast(get()); -} - -inline OpSpecializationNode* OpSpecialization::operator->() { - return static_cast(get_mutable()); -} - -inline const OpStrategyNode* OpStrategy::operator->() const { - return static_cast(get()); -} - -inline OpStrategyNode* OpStrategy::operator->() { - return static_cast(get_mutable()); -} + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); +}; } // namespace relay } // namespace tvm diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 170d7acc834a..83af925ef5bd 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -25,7 +25,6 @@ from ..base import register_relay_node, Object from ... import _api_internal from ... import target as _target -from ..._ffi.function import register_func from ... import autotvm from .. import expr as _expr from .. import op as _op @@ -389,7 +388,7 @@ def visit_tuple_getitem(self, t): return [tup[t.index]] -@register_func("relay.backend.create_schedule") +@tvm._ffi.register_func("relay.backend.create_schedule") def create_schedule(src_func, target): return ScheduleGetter(target).create(src_func) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 72b5b1aa5b79..97f32cdcee72 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -172,7 +172,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() strategy.add_implement( - wrap_comptue_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), + wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw)) return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ca07604e6418..95efde9de161 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -71,12 +71,6 @@ def schedule_lrn_cuda(attrs, outs, target): with target: return topi.cuda.schedule_lrn(outs) -@schedule_l2_normalize.register(["cuda", "gpu"]) -def schedule_l2_normalize_cuda(attrs, outs, target): - """schedule L2 normalize for cuda""" - with target: - return topi.cuda.schedule_l2_normalize(outs) - @conv2d_strategy.register(["cuda", "gpu"]) def conv2d_strategy_cuda(attrs, inputs, out_type, target): """conv2d cuda strategy""" @@ -197,7 +191,7 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() strategy.add_implement( - wrap_comptue_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), + wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw)) return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 73923e554579..5d08e9c2374a 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -120,13 +120,6 @@ def schedule_lrn(attrs, outs, target): with target: return topi.generic.schedule_lrn(outs) -# l2_normalize -@generic_func -def schedule_l2_normalize(attrs, outs, target): - """Schedule L2 normalize op""" - with target: - return topi.generic.schedule_l2_normalize(outs) - # bitpack @generic_func def schedule_bitpack(attrs, outs, target): @@ -283,7 +276,7 @@ def deformable_conv2d_strategy(attrs, inputs, out_type, target): return strategy # conv2d_transpose -def wrap_comptue_conv2d_transpose(topi_compute): +def wrap_compute_conv2d_transpose(topi_compute): """wrap conv2d_transpose topi compute""" def compute_conv2d_transpose(attrs, inputs, out_dtype): """Compute definition of conv2d_transpose""" @@ -311,7 +304,7 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() strategy.add_implement( - wrap_comptue_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw)) return strategy diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py index 0600f875416a..818b93faaf41 100644 --- a/python/tvm/relay/op/strategy/hls.py +++ b/python/tvm/relay/op/strategy/hls.py @@ -121,7 +121,7 @@ def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() strategy.add_implement( - wrap_comptue_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw)) return strategy diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index 9e725f65c511..c21dd40c5cbe 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -28,12 +28,6 @@ def schedule_lrn_rocm(attrs, outs, target): with target: return topi.rocm.schedule_lrn(outs) -@schedule_l2_normalize.register("rocm") -def schedule_l2_normalize_rocm(attrs, outs, target): - """schedule L2 normalize for rocm""" - with target: - return topi.rocm.schedule_l2_normalize(outs) - @conv2d_strategy.register("rocm") def conv2d_strategy_cuda(attrs, inputs, out_type, target): """conv2d cuda strategy""" diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 51d525928ab6..435f1626f826 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -169,7 +169,7 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() strategy.add_implement( - wrap_comptue_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw)) return strategy diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index a6af6be73576..acf5f51941dc 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -966,7 +966,7 @@ def __init__(self, var, value, body): _ffi_api.Let, var, value, body) -@register_object +@tvm._ffi.register_object class Any(PrimExpr): """Any node. """ diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index d97d1e9df90c..b3b3671c8451 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter """Conv2D schedule on x86""" import logging diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index 19428c892da9..d983fdae9044 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member, import-outside-toplevel +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Conv2D int8 schedule on x86""" import tvm diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 0142e1b82209..275f33da54f3 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter """Depthwise Conv2D schedule on x86""" import tvm from tvm import autotvm @@ -79,8 +80,8 @@ def schedule_depthwise_conv2d_nchw(outs): def _pack_data(cfg, data, kernel): n, ic, ih, iw = get_const_tuple(data.shape) - filter, cm, kh, kw = get_const_tuple(kernel.shape) - oc = filter * cm + filters, cm, kh, kw = get_const_tuple(kernel.shape) + oc = filters * cm ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] ic_chunk = ic // ic_bn From 908422134728ae70c0b03922e4105f38c0d97584 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 6 Feb 2020 15:35:35 -0800 Subject: [PATCH 05/48] add name to op implement --- include/tvm/relay/op_attr_types.h | 20 ++- python/tvm/relay/backend/compile_engine.py | 7 +- python/tvm/relay/op/_reduce.py | 20 +-- python/tvm/relay/op/_tensor.py | 100 ++++++------ python/tvm/relay/op/_transform.py | 58 +++---- python/tvm/relay/op/annotation/annotation.py | 2 +- python/tvm/relay/op/contrib/_contrib.py | 2 +- python/tvm/relay/op/image/_image.py | 4 +- python/tvm/relay/op/nn/_nn.py | 28 ++-- python/tvm/relay/op/op.py | 147 ++++++++++++++++-- python/tvm/relay/op/strategy/arm_cpu.py | 49 +++--- python/tvm/relay/op/strategy/bifrost.py | 18 ++- python/tvm/relay/op/strategy/cuda.py | 109 ++++++++----- python/tvm/relay/op/strategy/generic.py | 102 ++++++++---- python/tvm/relay/op/strategy/hls.py | 29 ++-- .../tvm/relay/op/strategy/intel_graphics.py | 14 +- python/tvm/relay/op/strategy/mali.py | 18 ++- python/tvm/relay/op/strategy/opengl.py | 12 +- python/tvm/relay/op/strategy/rocm.py | 53 ++++--- python/tvm/relay/op/strategy/x86.py | 88 +++++++---- python/tvm/relay/op/vision/_yolo.py | 4 +- python/tvm/relay/quantize/_annotate.py | 4 +- src/relay/ir/op_attr_types.cc | 14 +- topi/python/topi/cuda/conv2d_winograd.py | 2 +- 24 files changed, 589 insertions(+), 315 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 497cb2f8d701..27325c3d2010 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -221,8 +221,8 @@ using Shape = Array; using FShapeFunc = runtime::TypedPackedFunc< Array(const Attrs& attrs, - const Array& inputs, - const Array& out_ndims)>; + const Array& inputs, + const Array& out_ndims)>; /*! * \brief Operator implementation in TVM. @@ -233,10 +233,13 @@ class OpImplementNode : public Object { FTVMCompute fcompute; /*! \brief Schedule function */ FTVMSchedule fschedule; + /*! \brief Name of the implementation */ + std::string name; /*! \brief Priority level */ - Integer plevel; + int plevel; void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("name", &name); v->Visit("plevel", &plevel); } @@ -302,10 +305,11 @@ class OpSpecialization : public ObjectRef { * \brief Add an implementation. * \param compute Compute function * \param schedule Schedule function - * \param plevel Priority level of this implemntation. + * \param name Name of the implementation + * \param plevel Priority level of the implementation */ void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - int plevel); + std::string name, int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); }; @@ -335,9 +339,11 @@ class OpStrategy : public ObjectRef { * \brief Add an implementation. * \param compute Compute function * \param schedule Schedule function - * \param plevel Priority level of this implementation. + * \param name Name of the implementation + * \param plevel Priority level of the implementation */ - void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, int plevel); + void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, std::string name, + int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); }; diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 83af925ef5bd..7f8db95fbbbb 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -18,6 +18,7 @@ """Backend code generation engine.""" from __future__ import absolute_import +import logging import hashlib import numpy as np import tvm @@ -32,6 +33,8 @@ from ..expr_functor import ExprVisitor from . import _backend +logger = logging.getLogger('compile_engine') + @register_relay_node class CachedFunc(Object): """Low-level tensor function to back a relay primitive function. @@ -189,7 +192,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): best_plevel_impl = None for impl in all_impls: - if best_plevel_impl is None or int(impl.plevel) > int(best_plevel_impl.plevel): + if best_plevel_impl is None or impl.plevel > best_plevel_impl.plevel: best_plevel_impl = impl if not use_autotvm: outs = best_plevel_impl.compute(attrs, inputs, out_type) @@ -266,7 +269,6 @@ def create_tensors(typ, tensors): tensor_outs.append(tensor) sch = None if not isinstance(self.master_attrs, _op.op_attrs.DeviceCopyAttrs): - # print('master op:', self.master_op.name) sch = self.master_implement.schedule(self.master_attrs, tensor_outs, self.target) for scalar in self.scalars: if scalar in sch.stage_map: @@ -336,6 +338,7 @@ def visit_call(self, call): if not is_dyn: best_impl, outputs = select_implement( op, call.attrs, inputs, ret_type, self.target) + logger.debug("Use implementation %s for op %s" % (best_impl.name, op.name)) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes # for dynamic case, we currently use the implementation with highest plevel diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index 3103520bdfef..9d52ed3af777 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -22,16 +22,16 @@ from ...api import convert from ...hybrid import script -_reg.register_strategy_reduce("argmax") -_reg.register_strategy_reduce("argmin") -_reg.register_strategy_reduce("sum") -_reg.register_strategy_reduce("all") -_reg.register_strategy_reduce("any") -_reg.register_strategy_reduce("max") -_reg.register_strategy_reduce("min") -_reg.register_strategy_reduce("prod") -_reg.register_strategy_reduce("mean") -_reg.register_strategy_reduce("variance") +_reg.register_reduce_schedule("argmax") +_reg.register_reduce_schedule("argmin") +_reg.register_reduce_schedule("sum") +_reg.register_reduce_schedule("all") +_reg.register_reduce_schedule("any") +_reg.register_reduce_schedule("max") +_reg.register_reduce_schedule("min") +_reg.register_reduce_schedule("prod") +_reg.register_reduce_schedule("mean") +_reg.register_reduce_schedule("variance") def _create_axis_record(attrs, inputs): axes = attrs.axis if attrs.axis is None else list(get_const_tuple(attrs.axis)) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index ebcb8e36aa65..7c8ccb7dd827 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -20,56 +20,56 @@ import topi from topi.util import get_const_tuple from .op import register_compute, register_shape_func -from .op import register_strategy_broadcast, register_strategy_injective +from .op import register_broadcast_schedule, register_injective_schedule from .op import register_pattern, OpPattern from ...hybrid import script from ...api import convert -register_strategy_broadcast("log") -register_strategy_broadcast("cos") -register_strategy_broadcast("sin") -register_strategy_broadcast("atan") -register_strategy_broadcast("exp") -register_strategy_broadcast("erf") -register_strategy_broadcast("sqrt") -register_strategy_broadcast("rsqrt") -register_strategy_broadcast("sigmoid") -register_strategy_broadcast("floor") -register_strategy_broadcast("ceil") -register_strategy_broadcast("trunc") -register_strategy_broadcast("round") -register_strategy_broadcast("sign") -register_strategy_broadcast("abs") -register_strategy_broadcast("tanh") -register_strategy_broadcast("add") -register_strategy_broadcast("subtract") -register_strategy_broadcast("multiply") -register_strategy_broadcast("divide") -register_strategy_broadcast("floor_divide") -register_strategy_broadcast("power") -register_strategy_broadcast("copy") -register_strategy_broadcast("logical_not") -register_strategy_broadcast("logical_and") -register_strategy_broadcast("logical_or") -register_strategy_broadcast("bitwise_not") -register_strategy_broadcast("bitwise_and") -register_strategy_broadcast("bitwise_or") -register_strategy_broadcast("bitwise_xor") -register_strategy_broadcast("negative") -register_strategy_broadcast("mod") -register_strategy_broadcast("floor_mod") -register_strategy_broadcast("equal") -register_strategy_broadcast("not_equal") -register_strategy_broadcast("less") -register_strategy_broadcast("less_equal") -register_strategy_broadcast("greater") -register_strategy_broadcast("greater_equal") -register_strategy_injective("maximum") -register_strategy_injective("minimum") -register_strategy_injective("right_shift") -register_strategy_injective("left_shift") -register_strategy_injective("shape_of") +register_broadcast_schedule("log") +register_broadcast_schedule("cos") +register_broadcast_schedule("sin") +register_broadcast_schedule("atan") +register_broadcast_schedule("exp") +register_broadcast_schedule("erf") +register_broadcast_schedule("sqrt") +register_broadcast_schedule("rsqrt") +register_broadcast_schedule("sigmoid") +register_broadcast_schedule("floor") +register_broadcast_schedule("ceil") +register_broadcast_schedule("trunc") +register_broadcast_schedule("round") +register_broadcast_schedule("sign") +register_broadcast_schedule("abs") +register_broadcast_schedule("tanh") +register_broadcast_schedule("add") +register_broadcast_schedule("subtract") +register_broadcast_schedule("multiply") +register_broadcast_schedule("divide") +register_broadcast_schedule("floor_divide") +register_broadcast_schedule("power") +register_broadcast_schedule("copy") +register_broadcast_schedule("logical_not") +register_broadcast_schedule("logical_and") +register_broadcast_schedule("logical_or") +register_broadcast_schedule("bitwise_not") +register_broadcast_schedule("bitwise_and") +register_broadcast_schedule("bitwise_or") +register_broadcast_schedule("bitwise_xor") +register_broadcast_schedule("negative") +register_broadcast_schedule("mod") +register_broadcast_schedule("floor_mod") +register_broadcast_schedule("equal") +register_broadcast_schedule("not_equal") +register_broadcast_schedule("less") +register_broadcast_schedule("less_equal") +register_broadcast_schedule("greater") +register_broadcast_schedule("greater_equal") +register_injective_schedule("maximum") +register_injective_schedule("minimum") +register_injective_schedule("right_shift") +register_injective_schedule("left_shift") +register_injective_schedule("shape_of") # zeros @register_compute("zeros") @@ -77,7 +77,7 @@ def zeros_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 0.0)] -register_strategy_broadcast("zeros") +register_broadcast_schedule("zeros") register_pattern("zeros", OpPattern.ELEMWISE) # zeros_like @@ -86,7 +86,7 @@ def zeros_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 0.0)] -register_strategy_broadcast("zeros_like") +register_broadcast_schedule("zeros_like") # ones @register_compute("ones") @@ -94,7 +94,7 @@ def ones_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 1.0)] -register_strategy_broadcast("ones") +register_broadcast_schedule("ones") register_pattern("ones", OpPattern.ELEMWISE) # ones_like @@ -103,7 +103,7 @@ def ones_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 1.0)] -register_strategy_broadcast("ones_like") +register_broadcast_schedule("ones_like") # clip @register_compute("clip") @@ -111,7 +111,7 @@ def clip_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)] -register_strategy_injective("clip") +register_injective_schedule("clip") @script def _cast_shape_function(x): diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index ccc53cc6ef1d..42c94349da8c 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -26,33 +26,33 @@ from ...hybrid import script from ...api import convert -_reg.register_strategy_broadcast("broadcast_to") -_reg.register_strategy_broadcast("broadcast_to_like") -_reg.register_strategy_broadcast("expand_dims") -_reg.register_strategy_broadcast("repeat") -_reg.register_strategy_broadcast("tile") -_reg.register_strategy_broadcast("where") -_reg.register_strategy_injective("squeeze") -_reg.register_strategy_injective("reshape") -_reg.register_strategy_injective("reshape_like") -_reg.register_strategy_injective("full") -_reg.register_strategy_injective("full_like") -_reg.register_strategy_injective("arange") -_reg.register_strategy_injective("reverse") -_reg.register_strategy_injective("cast") -_reg.register_strategy_injective("cast_like") -_reg.register_strategy_injective("reinterpret") -_reg.register_strategy_injective("strided_slice") -_reg.register_strategy_injective("slice_like") -_reg.register_strategy_injective("split") -_reg.register_strategy_injective("take") -_reg.register_strategy_injective("transpose") -_reg.register_strategy_injective("stack") -_reg.register_strategy_injective("_contrib_reverse_reshape") -_reg.register_strategy_injective("gather_nd") -_reg.register_strategy_injective("sequence_mask") -_reg.register_strategy_injective("one_hot") -_reg.register_strategy_reduce("collapse_sum_like") +_reg.register_broadcast_schedule("broadcast_to") +_reg.register_broadcast_schedule("broadcast_to_like") +_reg.register_broadcast_schedule("expand_dims") +_reg.register_broadcast_schedule("repeat") +_reg.register_broadcast_schedule("tile") +_reg.register_broadcast_schedule("where") +_reg.register_injective_schedule("squeeze") +_reg.register_injective_schedule("reshape") +_reg.register_injective_schedule("reshape_like") +_reg.register_injective_schedule("full") +_reg.register_injective_schedule("full_like") +_reg.register_injective_schedule("arange") +_reg.register_injective_schedule("reverse") +_reg.register_injective_schedule("cast") +_reg.register_injective_schedule("cast_like") +_reg.register_injective_schedule("reinterpret") +_reg.register_injective_schedule("strided_slice") +_reg.register_injective_schedule("slice_like") +_reg.register_injective_schedule("split") +_reg.register_injective_schedule("take") +_reg.register_injective_schedule("transpose") +_reg.register_injective_schedule("stack") +_reg.register_injective_schedule("_contrib_reverse_reshape") +_reg.register_injective_schedule("gather_nd") +_reg.register_injective_schedule("sequence_mask") +_reg.register_injective_schedule("one_hot") +_reg.register_reduce_schedule("collapse_sum_like") # concatenate _reg.register_schedule("concatenate", strategy.schedule_concatenate) @@ -63,10 +63,10 @@ def compute_strided_set(attrs, inputs, output_type): """Compute definition of strided_set""" return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])] -_reg.register_strategy_injective("strided_set") +_reg.register_injective_schedule("strided_set") # layout_transform -_reg.register_strategy_injective("layout_transform") +_reg.register_injective_schedule("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) # argwhere diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 5fcc112787a3..7bd5262dc30a 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -79,7 +79,7 @@ def checkpoint(data): """ return _make.checkpoint(data) -reg.register_strategy_injective("annotation.checkpoint") +reg.register_injective_schedule("annotation.checkpoint") def compiler_begin(data, compiler): diff --git a/python/tvm/relay/op/contrib/_contrib.py b/python/tvm/relay/op/contrib/_contrib.py index 16f22f1363c9..3927cef69706 100644 --- a/python/tvm/relay/op/contrib/_contrib.py +++ b/python/tvm/relay/op/contrib/_contrib.py @@ -33,4 +33,4 @@ reg.register_pattern("contrib.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # relay.contrib.ndarray_size -reg.register_strategy_injective("contrib.ndarray_size") +reg.register_injective_schedule("contrib.ndarray_size") diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py index 14a7080d5986..b98b2bc14c28 100644 --- a/python/tvm/relay/op/image/_image.py +++ b/python/tvm/relay/op/image/_image.py @@ -31,7 +31,7 @@ def compute_resize(attrs, inputs, out_type): out_dtype = attrs.out_dtype return [topi.image.resize(inputs[0], size, layout, method, coord_trans, out_dtype)] -reg.register_strategy_injective("image.resize") +reg.register_injective_schedule("image.resize") # crop and resize @@ -46,4 +46,4 @@ def compute_crop_and_resize(attrs, inputs, out_type): crop_size, layout, method, extrapolation_value, out_dtype)] -reg.register_strategy_injective("image.crop_and_resize") +reg.register_injective_schedule("image.crop_and_resize") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index d587a3e61b61..97a5fa6ec00b 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -28,7 +28,7 @@ from ....hybrid import script # relu -reg.register_strategy_broadcast("nn.relu") +reg.register_broadcast_schedule("nn.relu") reg.register_pattern("nn.relu", OpPattern.ELEMWISE) @@ -52,7 +52,7 @@ def compute_fifo_buffer(attrs, inputs, out_type): return [topi.nn.fifo_buffer(inputs[0], inputs[1], axis=attrs.get_int('axis'))] -reg.register_strategy_injective("nn.fifo_buffer") +reg.register_injective_schedule("nn.fifo_buffer") reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE) @@ -192,7 +192,7 @@ def legalize_conv2d_transpose(attrs, inputs, types): # bias_add -reg.register_strategy_injective("nn.bias_add") +reg.register_injective_schedule("nn.bias_add") reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) @@ -247,17 +247,17 @@ def legalize_conv2d_transpose(attrs, inputs, types): # leaky_relu -reg.register_strategy_broadcast("nn.leaky_relu") +reg.register_broadcast_schedule("nn.leaky_relu") reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) # prelu -reg.register_strategy_broadcast("nn.prelu") +reg.register_broadcast_schedule("nn.prelu") reg.register_pattern("nn.prelu", OpPattern.BROADCAST) # flatten -reg.register_strategy_broadcast("nn.batch_flatten") +reg.register_broadcast_schedule("nn.batch_flatten") reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE) @@ -283,7 +283,7 @@ def compute_upsampling(attrs, inputs, out_dtype): align_corners = attrs.align_corners return [topi.nn.upsampling(inputs[0], scale_h, scale_w, layout, method, align_corners)] -reg.register_strategy_injective("nn.upsampling") +reg.register_injective_schedule("nn.upsampling") # upsampling3d @@ -298,11 +298,11 @@ def compute_upsampling3d(attrs, inputs, out_dtype): return [topi.nn.upsampling3d(inputs[0], scale_d, scale_h, scale_w, layout, method,\ coordinate_transformation_mode)] -reg.register_strategy_injective("nn.upsampling3d") +reg.register_injective_schedule("nn.upsampling3d") # pad -reg.register_strategy_broadcast("nn.pad") +reg.register_broadcast_schedule("nn.pad") # mirror_pad @@ -313,7 +313,7 @@ def compute_mirror_pad(attrs, inputs, out_dtype): out = topi.nn.mirror_pad(inputs[0], pad_before=pad_before, pad_after=pad_after, mode=mode) return [out] -reg.register_strategy_broadcast("nn.mirror_pad") +reg.register_broadcast_schedule("nn.mirror_pad") # conv2d_winograd related operators @@ -418,7 +418,7 @@ def compute_cross_entropy(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(topi.log(x) * y) / x.shape[0]] -reg.register_strategy_reduce("nn.cross_entropy") +reg.register_reduce_schedule("nn.cross_entropy") reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) @@ -428,7 +428,7 @@ def compute_cross_entropy_with_logits(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(x * y) / x.shape[0]] -reg.register_strategy_reduce("nn.cross_entropy_with_logits") +reg.register_reduce_schedule("nn.cross_entropy_with_logits") reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) @@ -440,7 +440,7 @@ def compute_depth_to_space(attrs, inputs, out_dtype): mode = attrs.mode return [topi.nn.depth_to_space(inputs[0], block_size, layout=layout, mode=mode)] -reg.register_strategy_injective("nn.depth_to_space") +reg.register_injective_schedule("nn.depth_to_space") reg.register_pattern("nn.depth_to_space", OpPattern.INJECTIVE) @@ -451,7 +451,7 @@ def compute_space_to_depth(attrs, inputs, out_dtype): layout = attrs.layout return [topi.nn.space_to_depth(inputs[0], block_size, layout=layout)] -reg.register_strategy_injective("nn.space_to_depth") +reg.register_injective_schedule("nn.space_to_depth") reg.register_pattern("nn.space_to_depth", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 0a1500203db2..5e2426ba3407 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -147,9 +147,45 @@ class OpPattern(object): class OpImplement(Expr): """Operator implementation""" def compute(self, attrs, inputs, out_type): + """Call compute function. + + Parameters + ---------- + attrs : Attrs + Op attributes. + + inputs : list[tvm.Tensor] + The input tensors. + + out_type : relay.Type + The output type. + + Returns + ------- + outs : list[tvm.Tensor] + The output tensors. + """ return _OpImplementCompute(self, attrs, inputs, out_type) def schedule(self, attrs, outs, target): + """Call schedule function. + + Parameters + ---------- + attrs : Attrs + Op attributes. + + outs : list[tvm.Tensor] + The output tensors. + + target : tvm.Target + The target to schedule the op. + + Returns + ------- + schedule : tvm.Schedule + The schedule. + """ return _OpImplementSchedule(self, attrs, outs, target) @@ -160,29 +196,51 @@ class OpSpecialization(Expr): @register_relay_node class OpStrategy(Expr): + """Operator strategy""" def __init__(self): self.__init_handle_by_constructor__(_make.OpStrategy) - def add_implement(self, compute, schedule, plevel=10): - _OpStrategyAddImplement(self, compute, schedule, plevel) + def add_implement(self, compute, schedule, name="default", plevel=10): + """Add an implementation to the strategy + + Parameters + ---------- + compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) + -> List[Tensor] + The compute function. + + schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule + The schedule function. + + name : str + The name of implementation. + + plevel : int + The priority level of implementation. + """ + _OpStrategyAddImplement(self, compute, schedule, name, plevel) -def wrap_fstrategy(compute, schedule): - def fstrategy(attrs, inputs, out_type, target): +def _wrap_default_fstrategy(compute, schedule, name): + def _fstrategy(attrs, inputs, out_type, target): strategy = OpStrategy() - strategy.add_implement(compute, schedule) + strategy.add_implement(compute, schedule, name=name) return strategy - return fstrategy + return _fstrategy -def create_simple_fstrategy(op_name, schedule): +def _create_fstrategy_from_schedule(op_name, schedule): assert hasattr(schedule, "dispatch_dict") compute = get(op_name).get_attr("FTVMCompute") assert compute is not None, "FTVMCompute is not registered for op %s" % op_name fstrategy = get_native_generic_func("{}_strategy".format(op_name)) - fstrategy.set_default(wrap_fstrategy(compute, schedule.fdefault)) + name_pfx = schedule.__name__ + name_pfx = name_pfx[name_pfx.index('_')+1:] + fstrategy.set_default( + _wrap_default_fstrategy(compute, schedule.fdefault, "%s.generic" % name_pfx)) for key, sch in schedule.dispatch_dict.items(): - fstrategy.register(wrap_fstrategy(compute, sch), [key]) + fstrategy.register( + _wrap_default_fstrategy(compute, sch, "%s.%s" % (name_pfx, key)), [key]) return fstrategy @@ -194,7 +252,7 @@ def register_compute(op_name, compute=None, level=10): op_name : str The name of the op. - compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, target:Target) + compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) -> List[Tensor] The compute function. @@ -205,6 +263,20 @@ def register_compute(op_name, compute=None, level=10): def register_strategy(op_name, fstrategy=None, level=10): + """Register strategy function for an op. + + Parameters + ---------- + op_name : str + The name of the op. + + fstrategy : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, + target:Target) -> OpStrategy + The strategy function. Need to be native GenericFunc. + + level : int + The priority level + """ if not isinstance(fstrategy, GenericFunc): assert hasattr(fstrategy, "generic_func_node") fstrategy = fstrategy.generic_func_node @@ -212,19 +284,66 @@ def register_strategy(op_name, fstrategy=None, level=10): def register_schedule(op_name, schedule, level=10): - fstrategy = create_simple_fstrategy(op_name, schedule) + """Register schedule function for an op. + + This is used when compute function is the same for all targets and only + schedule is different. It requires FTVMCompute is already registered to + the op. + + Parameters + ---------- + op_name : str + The name of the op. + + schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule + The schedule function. Need to be target.generic_func. + + level : int + The priority level + """ + fstrategy = _create_fstrategy_from_schedule(op_name, schedule) return register_strategy(op_name, fstrategy, level) -def register_strategy_injective(op_name, level=10): +def register_injective_schedule(op_name, level=10): + """Register injective schedule function for an op. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ return register_schedule(op_name, _schedule_injective, level) -def register_strategy_broadcast(op_name, level=10): +def register_broadcast_schedule(op_name, level=10): + """Register broadcast schedule function for an op. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ return register_schedule(op_name, _schedule_injective, level) -def register_strategy_reduce(op_name, level=10): +def register_reduce_schedule(op_name, level=10): + """Register reduce schedule function for an op. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ return register_schedule(op_name, _schedule_reduce, level) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 97f32cdcee72..c88267bf36bb 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -16,8 +16,6 @@ # under the License. """Definition of ARM CPU operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import re import logging @@ -58,7 +56,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack)) + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.arm_cpu") _, _, kh, kw = get_const_tuple(kernel.shape) pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) @@ -67,24 +66,28 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), - 15) + name="conv2d_nchw_winograd.arm_cpu", + plevel=15) if pt == 1 and pb == 1 and pl == 1 and pr == 1: strategy.add_implement( wrap_compute_conv2d_winograd_nnpack( topi.arm_cpu.conv2d_nchw_winograd_nnpack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), - 13) + name="conv2d_nchw_winograd_nnpack.arm_cpu", + plevel=13) elif layout == "HWCN": assert kernel_layout == "HWIO" - logger.warning("conv2d with layout HWCN is not optimized for arm cpu.") + logger.warning("conv2d_hwcn is not optimized for arm cpu.") strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_hwcn), - wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") elif layout == "NHWC": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack)) + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") else: raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -93,17 +96,20 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if kernel_layout == "OIHW": strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.arm_cpu") strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), - 15) + name="depthwise_conv2d_nchw_spatial_pack.arm_cpu", + plevel=15) elif layout == "NHWC": assert kernel_layout == "HWOI" logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for arm cpu". format(layout)) @@ -113,7 +119,8 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") strategy.add_implement( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") else: raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu". format(layout)) @@ -148,14 +155,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 strategy.add_implement( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd)) + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.arm_cpu") if pt == 1 and pb == 1 and pl == 1 and pr == 1: strategy.add_implement( wrap_compute_conv2d_winograd_nnpack( topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform), wrap_topi_schedule( topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform), - 5) + name="conv2d_nchw_winograd_nnpack_withou_weight_transform.arm_cpu", + plevel=5) else: raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) @@ -173,7 +182,8 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw)) + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), + name="conv2d_tranpose_nchw.arm_cpu") return strategy @bitserial_conv2d_strategy.register("arm_cpu") @@ -184,11 +194,13 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if layout == "NCHW": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), - wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw)) + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.arm_cpu") elif layout == "NHWC": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.arm_cpu.bitserial_conv2d_nhwc), - wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc)) + wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.arm_cpu") else: raise ValueError("Data layout {} not supported.".format(layout)) return strategy @@ -199,5 +211,6 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense), - wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense)) + wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense), + name="bitserial_dense.arm_cpu") return strategy diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py index 9407000faed9..74c4b0bed530 100644 --- a/python/tvm/relay/op/strategy/bifrost.py +++ b/python/tvm/relay/op/strategy/bifrost.py @@ -16,9 +16,6 @@ # under the License. """Definition of bifrost operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import - -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -42,7 +39,8 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack)) + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.bifrost") _, _, kh, kw = get_const_tuple(kernel.shape) if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ @@ -50,7 +48,8 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): strategy.add_implement( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), - 15) + name="conv2d_nchw_winograd.bifrost", + plevel=15) else: raise RuntimeError("Unsupported conv2d layout {} for Mali(Bifrost)". format(layout)) @@ -59,7 +58,8 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.bifrost") else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for Mali(Bifrost)". format(layout)) @@ -82,7 +82,8 @@ def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 strategy.add_implement( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), - wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd)) + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.bifrost") else: raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) @@ -93,5 +94,6 @@ def dense_strategy_bifrost(attrs, inputs, out_type, target): """dense mali(bifrost) strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_dense(topi.bifrost.dense), - wrap_topi_schedule(topi.bifrost.schedule_dense)) + wrap_topi_schedule(topi.bifrost.schedule_dense), + name="dense.bifrost") return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 95efde9de161..59fb64958f00 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -16,8 +16,6 @@ # under the License. """Definition of CUDA/GPU operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -91,30 +89,35 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), + name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), - 15) + name="conv2d_nchw_winograd.cuda", + plevel=15) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), - wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn)) - # Re-enable this after @alexgl-github fix the conv2d_nhwc for cuda + wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), + name="conv2d_hwcn.cuda") + # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda # elif layout == "NHWC": # assert kernel_layout == "HWIO" # strategy.add_implement( # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), - # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc)) + # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), + # name="conv2d_nhwc.cuda") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.cuda") else: raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) # add cudnn implementation @@ -123,18 +126,22 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): padding[1] == padding[3]: strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True), - wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), 5) + wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), + name="conv2d_cudnn.cuda", + plevel=5) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), + name="dpethwise_nchw.cuda") elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.cuda") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -143,12 +150,14 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.cuda") elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implement( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), + name="group_conv2d_NCHWc_int8.cuda") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -166,7 +175,8 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform), wrap_topi_schedule( - topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform_cuda)) + topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform), + name="conv2d_nchw_winograd_without_weight_transform.cuda") else: raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) @@ -175,9 +185,13 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty @deformable_conv2d_strategy.register(["cuda", "gpu"]) def deformable_conv2d_strategy_cuda(attrs, inputs, out_type, target): """deformable_conv2d cuda strategy""" + layout = attrs.data_layout + assert layout == "NCHW" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw)) + strategy.add_implement( + wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw), + name="deformable_conv2d_nchw.cuda") return strategy @conv2d_transpose_strategy.register(["cuda", "gpu"]) @@ -192,7 +206,8 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.cuda") return strategy @conv3d_strategy.register(["cuda", "gpu"]) @@ -204,15 +219,18 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): if layout == "NCDHW": strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), - 10) + name="conv3d_ncdhw.cuda", + plevel=10) else: # layout == "NDHWC": strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), - 10) + name="conv3d_ndhwc.cuda", + plevel=10) if target.target_name == "cuda" and "cudnn" in target.libs: strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_cudnn), wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), - 15) + name="conv3d_cudnn.cuda", + plevel=15) return strategy @conv1d_strategy.register(["cuda", "gpu"]) @@ -225,10 +243,12 @@ def conv1d_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() if layout == "NCW": strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_ncw), - wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw)) + wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw), + name="conv1d_ncw.cuda") elif layout == "NWC": strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_nwc), - wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc)) + wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc), + name="conv1d_nwc.cuda") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -244,7 +264,8 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): assert dilation == (1,), "conv1d_transpose dilation is not supported" assert groups == 1, "conv1d_transpose groups == 1 only supported" strategy.add_implement(wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw), - wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw)) + wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw), + name="conv1d_transpose_ncw.cuda") return strategy @dense_strategy.register(["cuda", "gpu"]) @@ -253,17 +274,23 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() if out_type.dtype == "int8": strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8), - wrap_topi_schedule(topi.cuda.schedule_dense_int8)) + wrap_topi_schedule(topi.cuda.schedule_dense_int8), + name="dense_int8.cuda") else: strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch)) + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda") b = inputs[0].shape[0] with SpecializedCondition(b >= 32): strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch), - wrap_topi_schedule(topi.cuda.schedule_dense_large_batch)) + wrap_topi_schedule(topi.cuda.schedule_dense_large_batch), + name="dense_large_batch.cuda", + plevel=15) if target.target_name == "cuda" and "cublas" in target.libs: strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas), - wrap_topi_schedule(topi.cuda.schedule_dense_cublas), 5) + wrap_topi_schedule(topi.cuda.schedule_dense_cublas), + name="dense_cublas.cuda", + plevel=20) return strategy @batch_matmul_strategy.register(["cuda", "gpu"]) @@ -272,27 +299,31 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), wrap_topi_schedule(topi.cuda.schedule_batch_matmul), - 10) + name="batch_matmul.cuda", + plevel=10) if target.target_name == "cuda" and "cublas" in target.libs: strategy.add_implement(wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), wrap_topi_schedule(topi.generic.schedule_extern), - 15) + name="batch_matmul_cublas.cuda", + plevel=15) return strategy @argsort_strategy.register(["cuda", "gpu"]) def argsort_strategy_cuda(attrs, inputs, out_type, target): """argsort cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_argsort(topi.cuda.argsort_gpu), - wrap_topi_schedule(topi.cuda.schedule_argsort)) + strategy.add_implement(wrap_compute_argsort(topi.cuda.argsort), + wrap_topi_schedule(topi.cuda.schedule_argsort), + name="argsort.cuda") return strategy @topk_strategy.register(["cuda", "gpu"]) def topk_strategy_cuda(attrs, inputs, out_type, target): """topk cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_topk(topi.cuda.topk_gpu), - wrap_topi_schedule(topi.cuda.schedule_topk)) + strategy.add_implement(wrap_compute_topk(topi.cuda.topk), + wrap_topi_schedule(topi.cuda.schedule_topk), + name="topk.cuda") return strategy @schedule_multibox_prior.register(["cuda", "gpu"]) @@ -312,7 +343,8 @@ def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target): """get_valid_counts cuda strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_get_valid_counts(topi.cuda.get_valid_counts), - wrap_topi_schedule(topi.cuda.schedule_get_valid_counts)) + wrap_topi_schedule(topi.cuda.schedule_get_valid_counts), + name="get_valid_counts.cuda") return strategy @nms_strategy.register(["cuda", "gpu"]) @@ -320,7 +352,8 @@ def nms_strategy_cuda(attrs, inputs, out_type, target): """nms cuda strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_nms(topi.cuda.non_max_suppression), - wrap_topi_schedule(topi.cuda.schedule_nms)) + wrap_topi_schedule(topi.cuda.schedule_nms), + name="nms.cuda") return strategy @roi_align_strategy.register(["cuda", "gpu"]) @@ -328,7 +361,8 @@ def roi_align_strategy_cuda(attrs, inputs, out_type, target): """roi_align cuda strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.cuda.schedule_roi_align)) + wrap_topi_schedule(topi.cuda.schedule_roi_align), + name="roi_align_nchw.cuda") return strategy @schedule_roi_pool.register(["cuda", "gpu"]) @@ -342,5 +376,6 @@ def proposal_strategy_cuda(attrs, inputs, out_type, target): """proposal cuda strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_proposal(topi.cuda.proposal), - wrap_topi_schedule(topi.cuda.schedule_proposal)) + wrap_topi_schedule(topi.cuda.schedule_proposal), + name="proposal.cuda") return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 5d08e9c2374a..24cd8e71dacd 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -16,7 +16,7 @@ # under the License. """Definition of generic operator strategy.""" # pylint: disable=invalid-name,unused-argument -from __future__ import absolute_import +import logging import re import topi @@ -24,6 +24,8 @@ from .. import op as _op from ....target import generic_func, override_native_generic_func +logger = logging.getLogger('strategy') + def wrap_topi_schedule(topi_schedule): """Wrap TOPI schedule which doesn't use attrs""" def wrapper(attrs, outs, target): @@ -154,6 +156,7 @@ def _compute_conv2d(attrs, inputs, out_type): @override_native_generic_func("conv2d_strategy") def conv2d_strategy(attrs, inputs, out_type, target): """conv2d generic strategy""" + logger.warning("conv2d is not optimized for this platform.") strategy = _op.OpStrategy() data, kernel = inputs dilation = get_const_tuple(attrs.dilation) @@ -169,17 +172,20 @@ def conv2d_strategy(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_nchw), - wrap_topi_schedule(topi.generic.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_conv2d_nchw), + name="conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_nhwc), - wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc)) + wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc), + name="conv2d_nhwc.generic") elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_hwcn), - wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") else: raise RuntimeError("Unsupported conv2d layout {}".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -187,12 +193,14 @@ def conv2d_strategy(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -200,7 +208,8 @@ def conv2d_strategy(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -209,25 +218,30 @@ def conv2d_strategy(attrs, inputs, out_type, target): @override_native_generic_func("conv2d_NCHWc_strategy") def conv2d_NCHWc_strategy(attrs, inputs, out_type, target): """conv2d_NCHWc generic strategy""" + logger.warning("conv2d_NCHWc is not optimized for this platform.") strategy = _op.OpStrategy() if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8": strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_NCHWc_int8, True, True), - wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.generic") else: strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc)) + wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.generic") return strategy # depthwise_conv2d_NCHWc @override_native_generic_func("depthwise_conv2d_NCHWc_strategy") def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target): """depthwise_conv2d generic strategy""" + logger.warning("depthwise_conv2d_NCHWc is not optimized for this platform.") strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc), + name="depthwise_conv2d_NCHWc.generic") return strategy # conv2d_winograd_without_weight_transform @@ -270,9 +284,14 @@ def _compute_deformable_conv2d(attrs, inputs, out_dtype): @override_native_generic_func("deformable_conv2d_strategy") def deformable_conv2d_strategy(attrs, inputs, out_type, target): """deformable_conv2d generic strategy""" + logger.warning("deformable_conv2d is not optimized for this platform.") + layout = attrs.data_layout + assert layout == "NCHW" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw), - wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw)) + strategy.add_implement( + wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw), + name="deformable_conv2d.generic") return strategy # conv2d_transpose @@ -296,6 +315,7 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): @override_native_generic_func("conv2d_transpose_strategy") def conv2d_transpose_strategy(attrs, inputs, out_type, target): """conv2d_transpose generic strategy""" + logger.warning("conv2d_transpose is not optimized for this platform.") layout = attrs.data_layout dilation = get_const_tuple(attrs.dilation) groups = attrs.groups @@ -305,7 +325,8 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw)) + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.generic") return strategy # conv3d @@ -336,14 +357,17 @@ def _compute_conv3d(attrs, inputs, out_type): @override_native_generic_func("conv3d_strategy") def conv3d_strategy(attrs, inputs, out_type, target): """conv3d generic strategy""" + logger.warning("conv3d is not optimized for this platform.") strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCDHW": strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw)) + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") elif layout == "NDHWC": strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ndhwc), - wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc)) + wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.generic") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -366,6 +390,7 @@ def _compute_conv1d(attrs, inputs, out_type): @override_native_generic_func("conv1d_strategy") def conv1d_strategy(attrs, inputs, out_type, target): """conv1d generic strategy""" + logger.warning("conv1d is not optimized for this platform.") layout = attrs.data_layout dilation = get_const_tuple(attrs.dilation) if dilation[0] < 1: @@ -373,10 +398,12 @@ def conv1d_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() if layout == "NCW": strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), - wrap_topi_schedule(topi.generic.schedule_conv1d_ncw)) + wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), + name="conv1d_ncw.generic") elif layout == "NWC": strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(topi.generic.schedule_conv1d_nwc)) + wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), + name="conv1d_nwc.generic") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -398,6 +425,7 @@ def _compute_conv1d_tranpsoe(attrs, inputs, out_type): @override_native_generic_func("conv1d_transpose_strategy") def conv1d_transpose_strategy(attrs, inputs, out_type, target): """conv1d_transpose generic strategy""" + logger.warning("conv1d_transpose is not optimized for this platform.") strategy = _op.OpStrategy() layout = attrs.data_layout dilation = get_const_tuple(attrs.dilation) @@ -406,7 +434,8 @@ def conv1d_transpose_strategy(attrs, inputs, out_type, target): assert dilation == (1,), "conv1d_transpose dilation is not supported" assert groups == 1, "conv1d_transpose groups == 1 only supported" strategy.add_implement(wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw), - wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw)) + wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw), + name="conv1d_transpose_ncw.generic") return strategy # dense @@ -422,9 +451,11 @@ def _compute_dense(attrs, inputs, out_type): @override_native_generic_func("dense_strategy") def dense_strategy(attrs, inputs, out_type, target): """dense generic strategy""" + logger.warning("dense is not optimized for this platform.") strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.generic.schedule_dense)) + wrap_topi_schedule(topi.generic.schedule_dense), + name="dense.generic") return strategy # batch_matmul @@ -437,9 +468,11 @@ def _compute_batch_matmul(attrs, inputs, out_type): @override_native_generic_func("batch_matmul_strategy") def batch_matmul_strategy(attrs, inputs, out_type, target): """batch_matmul generic strategy""" + logger.warning("batch_matmul is not optimized for this platform.") strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), - wrap_topi_schedule(topi.generic.schedule_batch_matmul)) + wrap_topi_schedule(topi.generic.schedule_batch_matmul), + name="batch_matmul.generic") return strategy # sparse_dense @@ -471,7 +504,8 @@ def argsort_strategy(attrs, inputs, out_type, target): """argsort generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_argsort(topi.argsort), - wrap_topi_schedule(topi.generic.schedule_argsort)) + wrap_topi_schedule(topi.generic.schedule_argsort), + name="argsort.generic") return strategy # topk @@ -493,7 +527,8 @@ def topk_strategy(attrs, inputs, out_type, target): """topk generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_topk(topi.topk), - wrap_topi_schedule(topi.generic.schedule_topk)) + wrap_topi_schedule(topi.generic.schedule_topk), + name="topk.generic") return strategy # multibox_prior @@ -525,7 +560,8 @@ def get_valid_counts_strategy(attrs, inputs, out_type, target): """get_valid_counts generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_get_valid_counts(topi.vision.get_valid_counts), - wrap_topi_schedule(topi.generic.schedule_get_valid_counts)) + wrap_topi_schedule(topi.generic.schedule_get_valid_counts), + name="get_valid_counts.generic") return strategy # non-maximum suppression @@ -551,7 +587,8 @@ def nms_strategy(attrs, inputs, out_type, target): """nms generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_nms(topi.vision.non_max_suppression), - wrap_topi_schedule(topi.generic.schedule_nms)) + wrap_topi_schedule(topi.generic.schedule_nms), + name="nms.generic") return strategy # roi_align @@ -571,7 +608,8 @@ def roi_align_strategy(attrs, inputs, out_type, target): """roi_align generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align)) + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.generic") return strategy # roi_pool @@ -603,7 +641,8 @@ def proposal_strategy(attrs, inputs, out_type, target): """proposal generic strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_proposal(topi.vision.rcnn.proposal), - wrap_topi_schedule(topi.generic.schedule_proposal)) + wrap_topi_schedule(topi.generic.schedule_proposal), + name="proposal.generic") return strategy # argwhere @@ -632,16 +671,19 @@ def compute_bitserial_conv2d(attrs, inputs, out_dtype): @override_native_generic_func("bitserial_conv2d_strategy") def bitserial_conv2d_strategy(attrs, inputs, out_type, target): """bitserial_conv2d generic strategy""" + logger.warning("bitserial_conv2d is not optimized for this platform.") strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCHW": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), - wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.generic") elif layout == "NHWC": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), - wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc)) + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.generic") else: raise ValueError("Data layout {} not supported.".format(layout)) return strategy @@ -664,8 +706,10 @@ def compute_bitserial_dense(attrs, inputs, out_type): @override_native_generic_func("bitserial_dense_strategy") def bitserial_dense_strategy(attrs, inputs, out_type, target): """bitserial_dense generic strategy""" + logger.warning("bitserial_dense is not optimized for this platform.") strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_bitserial_dense(topi.nn.bitserial_dense), - wrap_topi_schedule(topi.generic.schedule_bitserial_dense)) + wrap_topi_schedule(topi.generic.schedule_bitserial_dense), + name="bitserial_dense.generic") return strategy diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py index 818b93faaf41..ca14ffe92d61 100644 --- a/python/tvm/relay/op/strategy/hls.py +++ b/python/tvm/relay/op/strategy/hls.py @@ -16,8 +16,6 @@ # under the License. """Definition of HLS operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -76,12 +74,14 @@ def conv2d_strategy_hls(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_nchw), - wrap_topi_schedule(topi.hls.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.hls.schedule_conv2d_nchw), + name="conv2d_nchw.hls") elif layout == "NHWC": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_nhwc), - wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc)) + wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc), + name="conv2d_nhwc.hls") else: raise RuntimeError("Unsupported conv2d layout {}".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -89,12 +89,14 @@ def conv2d_strategy_hls(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.hls") elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc), + name="depthwise_nhwc.hls") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -107,7 +109,8 @@ def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc)) + wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.hls") return strategy @conv2d_transpose_strategy.register("hls") @@ -122,7 +125,8 @@ def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), - wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw)) + wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.hls") return strategy @dense_strategy.register("hls") @@ -130,7 +134,8 @@ def dense_strategy_hls(attrs, inputs, out_type, target): """dense hls strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.hls.schedule_dense)) + wrap_topi_schedule(topi.hls.schedule_dense), + name="dense.hls") return strategy @bitserial_conv2d_strategy.register("hls") @@ -141,11 +146,13 @@ def bitserial_conv2d_strategy_hls(attrs, inputs, out_type, target): if layout == "NCHW": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), - wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw)) + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.hls") elif layout == "NHWC": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), - wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc)) + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.hls") else: raise ValueError("Data layout {} not supported.".format(layout)) return strategy diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py index c94d5cbc211d..cd047f79305e 100644 --- a/python/tvm/relay/op/strategy/intel_graphics.py +++ b/python/tvm/relay/op/strategy/intel_graphics.py @@ -16,8 +16,6 @@ # under the License. """Definition of x86 operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -40,13 +38,15 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.intel_graphics.conv2d_nchw), - wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw), + name="conv2d_nchw.intel_graphics") # conv2d_NCHWc won't work without alter op layout pass # TODO(@Laurawly): fix this strategy.add_implement( wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), - 5) + name="conv2d_NCHWc.intel_graphics", + plevel=5) else: raise RuntimeError("Unsupported conv2d layout {} for intel graphics". format(layout)) @@ -55,7 +55,8 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.intel_graphics.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.intel_graphics") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -68,5 +69,6 @@ def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc)) + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.intel_graphics") return strategy diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index 8641a959952f..b30b916a0285 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -16,9 +16,6 @@ # under the License. """Definition of mali operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import - -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -41,7 +38,8 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack)) + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.mali") _, _, kh, kw = get_const_tuple(kernel.shape) if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ @@ -49,7 +47,8 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): strategy.add_implement( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), - 15) + name="conv2d_nchw_winograd.mali", + plevel=15) else: raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -57,7 +56,8 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.mali") else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) else: # group_conv2d @@ -79,7 +79,8 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 strategy.add_implement( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), - wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd)) + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.mali") else: raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) @@ -90,5 +91,6 @@ def dense_strategy_mali(attrs, inputs, out_type, target): """dense mali strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_dense(topi.mali.dense), - wrap_topi_schedule(topi.mali.schedule_dense)) + wrap_topi_schedule(topi.mali.schedule_dense), + name="dense.mali") return strategy diff --git a/python/tvm/relay/op/strategy/opengl.py b/python/tvm/relay/op/strategy/opengl.py index f5da48c150c2..c21ccc5593e6 100644 --- a/python/tvm/relay/op/strategy/opengl.py +++ b/python/tvm/relay/op/strategy/opengl.py @@ -16,8 +16,6 @@ # under the License. """Definition of OpenGL operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -54,20 +52,22 @@ def schedule_softmax_opengl(attrs, outs, target): @conv2d_strategy.register("opengl") def conv2d_strategy_opengl(attrs, inputs, out_type, target): - """conv2d hls strategy""" + """conv2d opengl strategy""" strategy = _op.OpStrategy() groups = attrs.groups layout = attrs.data_layout assert groups == 1, "Don't support group conv2d on OpenGL" assert layout == "NCHW", "Only support conv2d layout NCHW for OpenGL" strategy.add_implement(wrap_compute_conv2d(topi.nn.conv2d), - wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw), + name="conv2d_nchw.opengl") return strategy @dense_strategy.register("opengl") def dense_strategy_opengl(attrs, inputs, out_type, target): - """dense hls strategy""" + """dense opengl strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.opengl.schedule_dense)) + wrap_topi_schedule(topi.opengl.schedule_dense), + name="dense.opengl") return strategy diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index c21dd40c5cbe..a356521cc6f7 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -16,8 +16,6 @@ # under the License. """Definition of ROCm operator strategy.""" # pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import -from __future__ import absolute_import - import topi from .generic import * from .. import op as _op @@ -29,8 +27,8 @@ def schedule_lrn_rocm(attrs, outs, target): return topi.rocm.schedule_lrn(outs) @conv2d_strategy.register("rocm") -def conv2d_strategy_cuda(attrs, inputs, out_type, target): - """conv2d cuda strategy""" +def conv2d_strategy_rocm(attrs, inputs, out_type, target): + """conv2d rocm strategy""" strategy = _op.OpStrategy() data, kernel = inputs dilation_h, dilation_w = attrs.get_int_tuple("dilation") @@ -47,28 +45,34 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), + name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1: strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), - 15) + name="conv2d_nchw_winograd.cuda", + plevel=15) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), - wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn)) - elif layout == "NHWC": - assert kernel_layout == "HWIO" - strategy.add_implement( - wrap_compute_conv2d(topi.cuda.conv2d_nhwc), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), + name="conv2d_hwcn.cuda") + # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda + # elif layout == "NHWC": + # assert kernel_layout == "HWIO" + # strategy.add_implement( + # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), + # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), + # name="conv2d_nhwc.cuda") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.cuda") else: raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) # add miopen implementation @@ -76,18 +80,22 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if layout == "NCHW": strategy.add_implement( wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), - wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), 5) + wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), + name="conv2d_nchw_miopen.rocm", + plevel=15) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.cuda") elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.cuda") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -96,12 +104,14 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" strategy.add_implement( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw)) + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.cuda") elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implement( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), - wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), + name="group_conv2d_NCHWc_int8.cuda") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -113,10 +123,13 @@ def dense_strategy_rocm(attrs, inputs, out_type, target): assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense" strategy.add_implement(wrap_compute_dense(topi.rocm.dense), - wrap_topi_schedule(topi.rocm.schedule_dense)) + wrap_topi_schedule(topi.rocm.schedule_dense), + name="dense.rocm") if target.target_name == "rocm" and "rocblas" in target.libs: assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported." strategy.add_implement( wrap_compute_dense(topi.rocm.dense_rocblas), - wrap_topi_schedule(topi.rocm.dense_rocblas), 5) + wrap_topi_schedule(topi.rocm.dense_rocblas), + name="dense_rocblas.rocm", + plevel=5) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 435f1626f826..ae0e7a1bf2d1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -16,8 +16,6 @@ # under the License. """Definition of x86 operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import -from __future__ import absolute_import - import logging import topi @@ -81,25 +79,29 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): strategy.add_implement( wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), - wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8)) + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), + name="conv2d_nchw_int8.x86") else: strategy.add_implement( wrap_compute_conv2d(topi.x86.conv2d_nchw), - wrap_topi_schedule(topi.x86.schedule_conv2d_nchw)) + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), + name="conv2d_nchw.x86") elif layout == "NHWC": assert kernel_layout == "HWIO" logger.warning("For x86 target, NCHW layout is recommended for conv2d.") strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_nhwc), - wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc)) + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), + name="conv2d_nhwc.x86") elif layout == "HWCN": assert kernel_layout == "HWIO" logger.warning("For x86 target, NCHW layout is recommended for conv2d.") strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_hwcn), - wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn)) + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") else: - raise RuntimeError("Unsupported conv2d layout {} for cpu".format(layout)) + raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" @@ -107,29 +109,32 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): if channel_multiplier == 1: strategy.add_implement( wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.x86") else: logger.warning("For x86 target, depthwise_conv2d with channel " "multiplier greater than 1 is not optimized") strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWOI" - logger.warning("For x86 target, depthwise_conv2d with NCHW layout is " - "not optimized.") + logger.warning("depthwise_conv2d_nhwc is not optimized for x86.") strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc)) + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d if layout == 'NCHW': assert kernel_layout == "OIHW" - logger.warning("group_conv2d is not optimized for cpu.") + logger.warning("group_conv2d is not optimized for x86.") strategy.add_implement( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw)) + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -142,11 +147,13 @@ def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): strategy.add_implement( wrap_compute_conv2d(topi.x86.conv2d_NCHWc_int8, True, True), - wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8)) + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.x86") else: strategy.add_implement( wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc)) + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.x86") return strategy @depthwise_conv2d_NCHWc_strategy.register("cpu") @@ -155,7 +162,8 @@ def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True), - wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc)) + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc), + name="depthwise_conv2d_NCHWc.x86") return strategy @conv2d_transpose_strategy.register("cpu") @@ -170,7 +178,8 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), - wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw)) + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.x86") return strategy @conv3d_strategy.register("cpu") @@ -179,12 +188,14 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCDHW": - logger.warning("conv3d with layout NCDHW is not optimized for cpu.") + logger.warning("conv3d with layout NCDHW is not optimized for x86.") strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw)) + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") elif layout == "NDHWC": strategy.add_implement(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), - wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc)) + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.x86") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -199,10 +210,12 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() if layout == "NCW": strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), - wrap_topi_schedule(topi.x86.schedule_conv1d_ncw)) + wrap_topi_schedule(topi.x86.schedule_conv1d_ncw), + name="conv1d_ncw.x86") elif layout == "NWC": strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(topi.x86.schedule_conv1d_nwc)) + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), + name="conv1d_nwc.x86") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -214,14 +227,19 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): _, k = inputs[0].shape strategy.add_implement(wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), - 10) + name="dense_nopack.x86", + plevel=10) if "cblas" in target.libs: strategy.add_implement(wrap_compute_dense(topi.x86.dense_cblas), wrap_topi_schedule(topi.x86.schedule_dense_cblas), - 5) + name="dense_cblas.x86", + plevel=5) with SpecializedCondition(k > 16): + # this implementation may not be well-optimized, so use plevel=8 for now. strategy.add_implement(wrap_compute_dense(topi.x86.dense_pack), - wrap_topi_schedule(topi.x86.schedule_dense_pack)) + wrap_topi_schedule(topi.x86.schedule_dense_pack), + name="dense_pack.x86", + plevel=8) return strategy @batch_matmul_strategy.register("cpu") @@ -230,11 +248,13 @@ def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul), wrap_topi_schedule(topi.x86.schedule_batch_matmul), - 10) + name="batch_matmul.x86", + plevel=10) if "cblas" in target.libs: strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), - 5) + name="batch_matmul_cblas.x86", + plevel=5) return strategy @schedule_sparse_dense.register("cpu") @@ -248,7 +268,8 @@ def roi_align_strategy_cpu(attrs, inputs, out_type, target): """roi_align x86 strategy""" strategy = _op.OpStrategy() strategy.add_implement(wrap_compute_roi_align(topi.x86.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align)) + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.x86") return strategy @bitserial_conv2d_strategy.register("cpu") @@ -259,11 +280,13 @@ def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target): if layout == "NCHW": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), - wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw)) + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.x86") elif layout == "NHWC": strategy.add_implement( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nhwc), - wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc)) + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.x86") else: raise ValueError("Data layout {} not supported.".format(layout)) return strategy @@ -274,5 +297,6 @@ def bitserial_dense_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implement( wrap_compute_bitserial_dense(topi.x86.bitserial_dense), - wrap_topi_schedule(topi.x86.schedule_bitserial_dense)) + wrap_topi_schedule(topi.x86.schedule_bitserial_dense), + name="bitserial_dense.x86") return strategy diff --git a/python/tvm/relay/op/vision/_yolo.py b/python/tvm/relay/op/vision/_yolo.py index d6ac0d4bfbcf..c58a7a367549 100644 --- a/python/tvm/relay/op/vision/_yolo.py +++ b/python/tvm/relay/op/vision/_yolo.py @@ -18,8 +18,8 @@ """Backend compiler related feature registration""" from __future__ import absolute_import from ..op import register_pattern, OpPattern -from ..op import register_strategy_injective +from ..op import register_injective_schedule # reorg register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE) -register_strategy_injective("vision.yolo_reorg") +register_injective_schedule("vision.yolo_reorg") diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 82b243a9fc14..71cd86723a6c 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -52,10 +52,10 @@ def simulated_quantize_compute(attrs, inputs, out_type, target): return [rdata] -_reg.register_strategy_injective("relay.op.annotation.simulated_quantize") +_reg.register_injective_schedule("relay.op.annotation.simulated_quantize") _reg.register_pattern("relay.op.annotation.simulated_quantize", _reg.OpPattern.ELEMWISE) -_reg.register_strategy_injective("annotation.cast_hint") +_reg.register_injective_schedule("annotation.cast_hint") @register_relay_node diff --git a/src/relay/ir/op_attr_types.cc b/src/relay/ir/op_attr_types.cc index 38f890ba75d4..c39427b91372 100644 --- a/src/relay/ir/op_attr_types.cc +++ b/src/relay/ir/op_attr_types.cc @@ -41,16 +41,19 @@ te::Schedule OpImplement::Schedule(const Attrs& attrs, void OpSpecialization::AddImplement(tvm::relay::FTVMCompute fcompute, tvm::relay::FTVMSchedule fschedule, + std::string name, int plevel) { auto n = make_object(); n->fcompute = fcompute; n->fschedule = fschedule; - n->plevel = IntImm(DataType::Int(32), plevel); + n->name = name; + n->plevel = plevel; (*this)->implements.push_back(OpImplement(n)); } void OpStrategy::AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel) { auto curr_cond = te::SpecializedCondition::Current(); auto specializations = (*this)->specializations; @@ -62,12 +65,12 @@ void OpStrategy::AddImplement(FTVMCompute fcompute, } } if (op_spec.defined()) { - op_spec.AddImplement(fcompute, fschedule, plevel); + op_spec.AddImplement(fcompute, fschedule, name, plevel); } else { ObjectPtr n = make_object(); n->condition = curr_cond; op_spec = OpSpecialization(n); - op_spec.AddImplement(fcompute, fschedule, plevel); + op_spec.AddImplement(fcompute, fschedule, name, plevel); (*this)->specializations.push_back(op_spec); } } @@ -101,8 +104,9 @@ TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplement") OpStrategy strategy = args[0]; FTVMCompute compute = args[1]; FTVMSchedule schedule = args[2]; - int plevel = args[3]; - strategy.AddImplement(compute, schedule, plevel); + std::string name = args[3]; + int plevel = args[4]; + strategy.AddImplement(compute, schedule, name, plevel); }); diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 0f22a48bd368..6e09be97390c 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -304,7 +304,7 @@ def conv2d_nchw_winograd_without_weight_transform(cfg, data, kernel, strides, @autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.cuda") -def schedule_conv2d_nchw_winograd_without_weight_transform_cuda(cfg, outs): +def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): """TOPI schedule callback""" s = tvm.create_schedule([x.op for x in outs]) From dfb9b2f88e30b7ccbc2e6c0005e2a82968c885e8 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 6 Feb 2020 15:55:45 -0800 Subject: [PATCH 06/48] Modify topi tests (#9) * Add pooling, reorg, softmax and vision * Add lrn --- topi/tests/python/common.py | 8 ++++ topi/tests/python/test_topi_lrn.py | 18 ++++++-- topi/tests/python/test_topi_pooling.py | 39 ++++++++++++++---- topi/tests/python/test_topi_reorg.py | 12 ++++-- topi/tests/python/test_topi_softmax.py | 19 ++++++--- topi/tests/python/test_topi_vision.py | 57 ++++++++++++++++++++++---- 6 files changed, 126 insertions(+), 27 deletions(-) diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index 372d19628ca0..b76f01863094 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -64,6 +64,14 @@ def get_schedule_reduce(target): return _reduce_schedule[key] return _reduce_schedule["generic"] +def get_schedule(target, schedule): + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in schedule: + return schedule[key] + return schedule["generic"] + get_schedule_broadcast = get_schedule_injective get_schedule_elemwise = get_schedule_injective diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py index 53139cdf10c6..e0f00f520e12 100644 --- a/topi/tests/python/test_topi_lrn.py +++ b/topi/tests/python/test_topi_lrn.py @@ -21,6 +21,18 @@ from topi.util import get_const_tuple import topi.testing +from common import get_schedule + +_lrn_schedule = { + "generic": topi.generic.schedule_lrn, + "gpu": topi.cuda.schedule_lrn, + "opencl": topi.cuda.schedule_lrn, + "metal": topi.cuda.schedule_lrn, + "rocm": topi.cuda.schedule_lrn, + "vulkan": topi.cuda.schedule_lrn, + "nvptx": topi.cuda.schedule_lrn, +} + def verify_lrn(shape, size, axis, bias, alpha, beta): A = tvm.placeholder(shape, name='A') B = topi.nn.lrn(A, size, axis, alpha, beta, bias) @@ -35,10 +47,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.schedule_lrn([B]) - else: - s = topi.cuda.schedule_lrn([B]) + s_func = get_schedule(device, _lrn_schedule) + s = s_func([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index cb81814e87f9..4c4a36a9e459 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -21,7 +21,26 @@ import topi import topi.testing from topi.util import get_const_tuple -from common import get_all_backend +from common import get_all_backend, get_schedule + +_pool_schedule = { + "generic": topi.generic.schedule_pool, + "cpu": topi.x86.schedule_pool, + "gpu": topi.cuda.schedule_pool, + "hls": topi.hls.schedule_pool, +} + +_adaptive_pool_schedule = { + "generic": topi.generic.schedule_adaptive_pool, + "cpu": topi.x86.schedule_adaptive_pool, + "gpu": topi.cuda.schedule_adaptive_pool, + "hls": topi.hls.schedule_adaptive_pool, +} + +_pool_grad_schedule = { + "generic": topi.generic.schedule_pool_grad, + "gpu": topi.cuda.schedule_pool_grad, +} def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True): iw = ih @@ -74,7 +93,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = get_schedule(device, _pool_schedule) + s = s_func(B, layout) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) @@ -129,7 +149,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool_grad(PoolGrad) + s_func = get_schedule(device, _pool_grad_schedule) + s = s_func(PoolGrad) a = tvm.nd.array(a_np, ctx) out_grad = tvm.nd.array(out_grad_np, ctx) @@ -201,7 +222,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_adaptive_pool(B) + s_func = get_schedule(device, _adaptive_pool_schedule) + s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) @@ -255,7 +277,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_adaptive_pool(out) + s_func = get_schedule(device, _adaptive_pool_schedule) + s = s_func(out) a = tvm.nd.array(np_data, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) f = tvm.build(s, [data, out], device) @@ -298,7 +321,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = get_schedule(device, _pool_schedule) + s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) @@ -350,7 +374,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = _pool_schedule[device] if device in _pool_schedule else _pool_schedule["generic"] + s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py index 4edb0a195e22..487d9cb47997 100644 --- a/topi/tests/python/test_topi_reorg.py +++ b/topi/tests/python/test_topi_reorg.py @@ -20,6 +20,12 @@ from topi.util import get_const_tuple import tvm import topi.testing +from common import get_schedule + +_reorg_schedule = { + "generic": topi.generic.schedule_reorg, + "gpu": topi.cuda.schedule_reorg, +} def verify_reorg(batch, in_size, in_channel, stride): '''Verify reorg operator by comparing outputs from tvm and numpy implementation''' @@ -46,10 +52,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.schedule_reorg([B]) - else: - s = topi.cuda.schedule_reorg([B]) + s_func = get_schedule(device, _reorg_schedule) + s = s_func([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py index 4836eef07508..e14c4ecdd206 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -23,7 +23,15 @@ import logging from topi.util import get_const_tuple -from common import get_all_backend +from common import get_all_backend, get_schedule + +_softmax_schedule = { + "generic": topi.generic.schedule_softmax, + "cpu": topi.x86.schedule_softmax, + "gpu": topi.cuda.schedule_softmax, + "hls": topi.hls.schedule_softmax, + "opengl": topi.opengl.schedule_softmax, +} def check_device(A, B, a_np, b_np, device, name): ctx = tvm.context(device, 0) @@ -32,11 +40,12 @@ def check_device(A, B, a_np, b_np, device, name): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_softmax(B) + s_func = get_schedule(device, _softmax_schedule) + s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - f = tvm.build(s, [A, B], device, name="softmax") + f = tvm.build(s, [A, B], device, name=name) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -50,7 +59,7 @@ def verify_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: + for device in get_all_backend(): check_device(A, B, a_np, b_np, device, "softmax") def verify_softmax_4d(shape, dtype="float32"): @@ -62,7 +71,7 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h*w, c)) b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: + for device in get_all_backend(): check_device(A, B, a_np, b_np, device, "softmax") def test_softmax(): diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 85e4180a0892..53cd9882bb6b 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -26,6 +26,42 @@ from topi.util import get_const_tuple from topi.vision import ssd, non_max_suppression, get_valid_counts +from common import get_schedule + +_get_valid_counts_schedule = { + "generic": topi.generic.schedule_get_valid_counts, + "gpu": topi.cuda.schedule_get_valid_counts, +} + +_nms_schedule = { + "generic": topi.generic.schedule_nms, + "gpu": topi.cuda.schedule_nms, +} + +_multibox_prior_schedule = { + "generic": topi.generic.schedule_multibox_prior, + "gpu": topi.cuda.schedule_multibox_prior, +} + +_multibox_detection_schedule = { + "generic": topi.generic.schedule_multibox_detection, + "gpu": topi.cuda.schedule_multibox_detection, +} + +_roi_align_schedule = { + "generic": topi.generic.schedule_roi_align, + "gpu": topi.cuda.schedule_roi_align, +} + +_roi_pool_schedule = { + "generic": topi.generic.schedule_roi_pool, + "gpu": topi.cuda.schedule_roi_pool, +} + +_proposal_schedule = { + "generic": topi.generic.schedule_proposal, + "gpu": topi.cuda.schedule_proposal, +} def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): dtype = "float32" @@ -56,7 +92,8 @@ def check_device(device): with tvm.target.create(device): data = tvm.placeholder(dshape, name="data", dtype=dtype) outs = get_valid_counts(data, score_threshold, id_index, score_index) - s = topi.generic.schedule_get_valid_counts(outs) + s_func = get_schedule(device, _get_valid_counts_schedule) + s = s_func(outs) tvm_input_data = tvm.nd.array(np_data, ctx) tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) @@ -107,7 +144,8 @@ def check_device(device): return_indices=False) indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index) - s = topi.generic.schedule_nms(out) + s_func = get_schedule(device, _nms_schedule) + s = s_func(out) indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) @@ -198,7 +236,8 @@ def check_device(device): out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) else: out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - s = topi.generic.schedule_multibox_prior(out) + s_func = get_schedule(device, _multibox_prior_schedule) + s = s_func(out) tvm_input_data = tvm.nd.array(input_data, ctx) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) @@ -244,7 +283,8 @@ def check_device(device): out = ssd.multibox_detection(cls_prob, loc_preds, anchors) else: out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) - s = topi.generic.schedule_multibox_detection(out) + s_func = get_schedule(device, _multibox_detection_schedule) + s = s_func(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), ctx) @@ -289,7 +329,8 @@ def check_device(device): b = topi.vision.rcnn.roi_align_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio) - s = topi.generic.schedule_roi_align(b) + s_func = get_schedule(device, _roi_align_schedule) + s = s_func(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) @@ -338,7 +379,8 @@ def check_device(device): with tvm.target.create(device): b = topi.vision.rcnn.roi_pool_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale) - s = topi.generic.schedule_roi_pool(b) + s_func = get_schedule(device, _roi_pool_schedule) + s = s_func(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) @@ -369,7 +411,8 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): out = topi.vision.proposal(cls_prob, bbox_pred, im_info, **attrs) - s = topi.generic.schedule_proposal(out) + s_func = get_schedule(device, _proposal_schedule) + s = s_func(out) f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device) tvm_cls_prob = tvm.nd.array(np_cls_prob, ctx=ctx) tvm_bbox_pred = tvm.nd.array(np_bbox_pred, ctx=ctx) From 1b94211d87bb9fce206ef6264bbdde43cc4eb9f5 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 7 Feb 2020 01:38:45 +0000 Subject: [PATCH 07/48] fix topi test --- topi/tests/python/common.py | 16 ++++-- topi/tests/python/test_topi_pooling.py | 2 +- topi/tests/python/test_topi_vision.py | 73 ++++++++++++-------------- 3 files changed, 47 insertions(+), 44 deletions(-) diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index b76f01863094..f88fd236b367 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -64,13 +64,21 @@ def get_schedule_reduce(target): return _reduce_schedule[key] return _reduce_schedule["generic"] -def get_schedule(target, schedule): +def get_schedule(target, schedule_map): if isinstance(target, str): target = tvm.target.create(target) for key in target.keys: - if key in schedule: - return schedule[key] - return schedule["generic"] + if key in schedule_map: + return schedule_map[key] + return schedule_map["generic"] + +def get_implement(target, implement_map): + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in implement_map: + return implement_map[key] + return implement_map["generic"] get_schedule_broadcast = get_schedule_injective get_schedule_elemwise = get_schedule_injective diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index 4c4a36a9e459..0dd6588f41a1 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -374,7 +374,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = _pool_schedule[device] if device in _pool_schedule else _pool_schedule["generic"] + s_func = get_schedule(device, _pool_schedule) s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 53cd9882bb6b..254ef8077286 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -26,16 +26,16 @@ from topi.util import get_const_tuple from topi.vision import ssd, non_max_suppression, get_valid_counts -from common import get_schedule +from common import get_schedule, get_implement -_get_valid_counts_schedule = { - "generic": topi.generic.schedule_get_valid_counts, - "gpu": topi.cuda.schedule_get_valid_counts, +_get_valid_counts_implement = { + "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts), + "gpu": (topi.cuda.get_valid_counts, topi.cuda.schedule_get_valid_counts), } -_nms_schedule = { - "generic": topi.generic.schedule_nms, - "gpu": topi.cuda.schedule_nms, +_nms_implement = { + "generic": (topi.vision.non_max_suppression, topi.generic.schedule_nms), + "gpu": (topi.cuda.non_max_suppression, topi.cuda.schedule_nms), } _multibox_prior_schedule = { @@ -48,9 +48,10 @@ "gpu": topi.cuda.schedule_multibox_detection, } -_roi_align_schedule = { - "generic": topi.generic.schedule_roi_align, - "gpu": topi.cuda.schedule_roi_align, +_roi_align_implement = { + "generic": (topi.vision.roi_align_nchw, topi.generic.schedule_roi_align), + "cpu": (topi.x86.roi_align_nchw, topi.generic.schedule_roi_align), + "gpu": (topi.vision.roi_align_nchw, topi.cuda.schedule_roi_align), } _roi_pool_schedule = { @@ -58,9 +59,9 @@ "gpu": topi.cuda.schedule_roi_pool, } -_proposal_schedule = { - "generic": topi.generic.schedule_proposal, - "gpu": topi.cuda.schedule_proposal, +_proposal_implement = { + "generic": (topi.vision.rcnn.proposal, topi.generic.schedule_proposal), + "gpu": (topi.cuda.proposal, topi.cuda.schedule_proposal), } def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): @@ -90,10 +91,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + fcompute, fschedule = get_implement(device, _get_valid_counts_implement) data = tvm.placeholder(dshape, name="data", dtype=dtype) - outs = get_valid_counts(data, score_threshold, id_index, score_index) - s_func = get_schedule(device, _get_valid_counts_schedule) - s = s_func(outs) + outs = fcompute(data, score_threshold, id_index, score_index) + s = fschedule(outs) tvm_input_data = tvm.nd.array(np_data, ctx) tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) @@ -132,21 +133,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - out = non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index, - return_indices=False) - indices_out = non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index) - else: - out = topi.cuda.non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index, - return_indices=False) - indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index) - s_func = get_schedule(device, _nms_schedule) - s = s_func(out) - indices_s = topi.generic.schedule_nms(indices_out) + fcompute, fschedule = get_implement(device, _nms_implement) + out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + coord_start=coord_start, score_index=score_index, id_index=id_index, + return_indices=False) + indices_out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + coord_start=coord_start, score_index=score_index, id_index=id_index) + s = fschedule(out) + indices_s = fschedule(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) @@ -326,11 +320,11 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - b = topi.vision.rcnn.roi_align_nchw(a, rois, pooled_size=pooled_size, - spatial_scale=spatial_scale, - sample_ratio=sample_ratio) - s_func = get_schedule(device, _roi_align_schedule) - s = s_func(b) + fcompute, fschedule = get_implement(device, _roi_align_implement) + b = fcompute(a, rois, pooled_size=pooled_size, + spatial_scale=spatial_scale, + sample_ratio=sample_ratio) + s = fschedule(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) @@ -410,9 +404,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.vision.proposal(cls_prob, bbox_pred, im_info, **attrs) - s_func = get_schedule(device, _proposal_schedule) - s = s_func(out) + fcompute, fschedule = get_schedule(device, _proposal_implement) + out = fcompute(cls_prob, bbox_pred, im_info, **attrs) + s = fschedule(out) f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device) tvm_cls_prob = tvm.nd.array(np_cls_prob, ctx=ctx) tvm_bbox_pred = tvm.nd.array(np_bbox_pred, ctx=ctx) @@ -471,4 +465,5 @@ def test_proposal(): test_multibox_prior() test_multibox_detection() test_roi_align() + test_roi_pool() test_proposal() From 3abc4ee57f0b7e617a138f6e7e3f047b7fcee91e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sat, 8 Feb 2020 01:30:44 +0000 Subject: [PATCH 08/48] fix more topi test --- python/tvm/relay/op/strategy/cuda.py | 4 +- python/tvm/relay/op/strategy/generic.py | 10 ++-- tests/python/unittest/test_codegen_blob.py | 2 +- .../unittest/test_lang_tensor_overload_op.py | 4 +- topi/python/topi/cuda/conv3d.py | 16 ++---- topi/python/topi/nn/conv3d.py | 6 +-- topi/python/topi/testing/__init__.py | 2 + topi/python/topi/testing/common.py | 52 ++++++++++++++++++ topi/tests/python/common.py | 51 ------------------ topi/tests/python/test_fifo_buffer.py | 12 +++-- topi/tests/python/test_topi_batch_matmul.py | 11 +++- topi/tests/python/test_topi_broadcast.py | 13 ++--- topi/tests/python/test_topi_clip.py | 5 +- topi/tests/python/test_topi_conv1d.py | 23 ++++++-- .../python/test_topi_conv1d_transpose_ncw.py | 12 +++-- topi/tests/python/test_topi_conv3d_ncdhw.py | 12 +++-- topi/tests/python/test_topi_conv3d_ndhwc.py | 17 ++++-- .../python/test_topi_deformable_conv2d.py | 12 +++-- topi/tests/python/test_topi_dense.py | 40 +++++++++----- topi/tests/python/test_topi_depth_to_space.py | 4 +- topi/tests/python/test_topi_image.py | 8 +-- topi/tests/python/test_topi_lrn.py | 4 +- topi/tests/python/test_topi_math.py | 8 +-- topi/tests/python/test_topi_pooling.py | 14 ++--- topi/tests/python/test_topi_reduce.py | 5 +- topi/tests/python/test_topi_relu.py | 5 +- topi/tests/python/test_topi_reorg.py | 3 +- topi/tests/python/test_topi_softmax.py | 4 +- topi/tests/python/test_topi_sort.py | 19 +++++-- topi/tests/python/test_topi_space_to_depth.py | 4 +- topi/tests/python/test_topi_transform.py | 54 +++++++++---------- topi/tests/python/test_topi_upsampling.py | 6 +-- topi/tests/python/test_topi_vision.py | 16 +++--- 33 files changed, 263 insertions(+), 195 deletions(-) create mode 100644 topi/python/topi/testing/common.py diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 59fb64958f00..0e51aabcb7b2 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -92,7 +92,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) - if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1 and \ + if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: strategy.add_implement( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), @@ -227,7 +227,7 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): name="conv3d_ndhwc.cuda", plevel=10) if target.target_name == "cuda" and "cudnn" in target.libs: - strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_cudnn), + strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), name="conv3d_cudnn.cuda", plevel=15) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 24cd8e71dacd..d8cbc1c9db45 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -330,7 +330,7 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): return strategy # conv3d -def wrap_compute_conv3d(topi_compute): +def wrap_compute_conv3d(topi_compute, need_layout=False): """wrap conv3d topi compute""" def _compute_conv3d(attrs, inputs, out_type): padding = get_const_tuple(attrs.padding) @@ -345,12 +345,14 @@ def _compute_conv3d(attrs, inputs, out_type): (dilation_d, dilation_h, dilation_w) = dilation if dilation_d < 1 or dilation_h < 1 or dilation_w < 1: raise ValueError("Dilation should be positive value") - - if groups == 1: + if groups != 1: + raise ValueError("Not support arbitrary group number for conv3d") + if need_layout: out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype) else: - raise ValueError("Not support arbitrary group number for now") + out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, + out_dtype) return [out] return _compute_conv3d diff --git a/tests/python/unittest/test_codegen_blob.py b/tests/python/unittest/test_codegen_blob.py index 2e0cee24097e..c14607d0c0b7 100644 --- a/tests/python/unittest/test_codegen_blob.py +++ b/tests/python/unittest/test_codegen_blob.py @@ -101,4 +101,4 @@ def test_system_lib(): if __name__ == "__main__": test_resnet18() - test_system_lib() + #test_system_lib() diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py index 98fdeaaf4328..22bc28a64297 100644 --- a/tests/python/unittest/test_lang_tensor_overload_op.py +++ b/tests/python/unittest/test_lang_tensor_overload_op.py @@ -108,7 +108,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = topi.testing.get_elemwise_schedule(device)(B) k_ = 2 foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ) @@ -154,7 +154,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index 70e5e8b60bb5..6424d2fb8884 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -26,8 +26,7 @@ @autotvm.register_topi_compute("conv3d_ncdhw.cuda") -def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', - out_dtype='float32'): +def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): """Conv3D operator for cuda backend. Parameters @@ -50,9 +49,6 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -61,7 +57,7 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, layout, out_dtype) + return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("conv3d_ncdhw.cuda") @@ -95,8 +91,7 @@ def _callback(op): @autotvm.register_topi_compute("conv3d_ndhwc.cuda") -def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout='NDHWC', - out_dtype='float32'): +def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): """Conv3D operator for cuda backend. Parameters @@ -119,9 +114,6 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout='NDHWC', dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -130,7 +122,7 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout='NDHWC', output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, layout, out_dtype) + return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("conv3d_ndhwc.cuda") diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index a37d9894d4c3..cc5cbe6af3c5 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -25,7 +25,7 @@ from ..util import simplify -def conv3d_ncdhw(Input, Filter, stride, padding, dilation, layout='NCDHW', out_dtype=None): +def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): """Conv3D operator in NCDHW layout. Parameters @@ -50,7 +50,6 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, layout='NCDHW', out_d Output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - assert layout == "NCDHW" if out_dtype is None: out_dtype = Input.dtype assert isinstance(stride, int) or len(stride) == 3 @@ -95,7 +94,7 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, layout='NCDHW', out_d axis=[rc, rz, ry, rx]), tag="conv3d_ncdhw") -def conv3d_ndhwc(Input, Filter, stride, padding, dilation, layout='NDHWC', out_dtype='float32'): +def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): """Convolution operator in NDHWC layout. Parameters @@ -120,7 +119,6 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, layout='NDHWC', out_d Output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - assert layout == "NDHWC" assert isinstance(stride, int) or len(stride) == 3 assert isinstance(dilation, int) or len(dilation) == 3 diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 91b7dc5bc60c..a9c8b49ce4bd 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -53,3 +53,5 @@ from .depth_to_space import depth_to_space_python from .space_to_depth import space_to_depth_python from .crop_and_resize_python import crop_and_resize_python +from .common import get_injective_schedule, get_reduce_schedule, get_broadcast_schedule, \ + get_elemwise_schedule, dispatch diff --git a/topi/python/topi/testing/common.py b/topi/python/topi/testing/common.py new file mode 100644 index 000000000000..2d6bc942efe7 --- /dev/null +++ b/topi/python/topi/testing/common.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +import topi + +_injective_schedule = { + "generic": topi.generic.schedule_injective, + "cpu": topi.x86.schedule_injective, + "arm_cpu": topi.arm_cpu.schedule_injective, + "gpu": topi.cuda.schedule_injective, + "hls": topi.hls.schedule_injective, + "opengl": topi.opengl.schedule_injective +} + +_reduce_schedule = { + "generic": topi.generic.schedule_reduce, + "cpu": topi.x86.schedule_reduce, + "gpu": topi.cuda.schedule_reduce, + "hls": topi.cuda.schedule_reduce +} + +def dispatch(target, dispatch_map): + if isinstance(target, str): + target = tvm.target.create(target) + assert isinstance(target, tvm.target.Target) + for key in target.keys: + if key in dispatch_map: + return dispatch_map[key] + return dispatch_map["generic"] + +def get_injective_schedule(target): + return dispatch(target, _injective_schedule) + +def get_reduce_schedule(target): + return dispatch(target, _reduce_schedule) + +get_broadcast_schedule = get_injective_schedule +get_elemwise_schedule = get_injective_schedule diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index f88fd236b367..e03708c67f26 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -32,57 +32,6 @@ def get_all_backend(): return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu'] -_injective_schedule = { - "generic": topi.generic.schedule_injective, - "cpu": topi.x86.schedule_injective, - "arm_cpu": topi.arm_cpu.schedule_injective, - "gpu": topi.cuda.schedule_injective, - "hls": topi.hls.schedule_injective, - "opengl": topi.opengl.schedule_injective -} - -_reduce_schedule = { - "generic": topi.generic.schedule_reduce, - "cpu": topi.x86.schedule_reduce, - "gpu": topi.cuda.schedule_reduce, - "hls": topi.cuda.schedule_reduce -} - -def get_schedule_injective(target): - if isinstance(target, str): - target = tvm.target.create(target) - for key in target.keys: - if key in _injective_schedule: - return _injective_schedule[key] - return _injective_schedule["generic"] - -def get_schedule_reduce(target): - if isinstance(target, str): - target = tvm.target.create(target) - for key in target.keys: - if key in _reduce_schedule: - return _reduce_schedule[key] - return _reduce_schedule["generic"] - -def get_schedule(target, schedule_map): - if isinstance(target, str): - target = tvm.target.create(target) - for key in target.keys: - if key in schedule_map: - return schedule_map[key] - return schedule_map["generic"] - -def get_implement(target, implement_map): - if isinstance(target, str): - target = tvm.target.create(target) - for key in target.keys: - if key in implement_map: - return implement_map[key] - return implement_map["generic"] - -get_schedule_broadcast = get_schedule_injective -get_schedule_elemwise = get_schedule_injective - class Int8Fallback(autotvm.FallbackContext): def _query_inside(self, target, workload): key = (target, workload) diff --git a/topi/tests/python/test_fifo_buffer.py b/topi/tests/python/test_fifo_buffer.py index 8b74e215df63..82c230629c05 100644 --- a/topi/tests/python/test_fifo_buffer.py +++ b/topi/tests/python/test_fifo_buffer.py @@ -18,10 +18,12 @@ import tvm import topi +import topi.testing import numpy as np -from common import get_all_backend, get_schedule_injective from tvm.contrib.pickle_memoize import memoize +from common import get_all_backend + def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype='float32'): buffer = tvm.placeholder(buffer_shape, name='buffer', dtype=dtype) data = tvm.placeholder(data_shape, name='data', dtype=dtype) @@ -52,7 +54,7 @@ def check_device(device): with tvm.target.create(device): out = topi.nn.fifo_buffer(data, buffer, axis=axis) - s = get_schedule_injective(device)([out]) + s = topi.testing.get_injective_schedule(device)([out]) buffer_tvm = tvm.nd.array(buffer_np, ctx=ctx) data_tvm = tvm.nd.array(data_np, ctx=ctx) @@ -128,7 +130,7 @@ def check_device(device): with tvm.target.create(device): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) - s = get_schedule_injective(device)([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_context = tvm.build(s, [inc_input, context, out], device, name='update_context') out = topi.nn.conv2d(context, kernel, strides=stride, padding=padding, dilation=dilate, @@ -137,12 +139,12 @@ def check_device(device): conv2d_inc = tvm.build(s, [context, kernel, out], device, name='conv2d_inc') out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) - s = get_schedule_injective(device)([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_output_window = tvm.build(s, [inc_output, output_window, out], device, name='update_output_window') out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) - s = get_schedule_injective(device)([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_input_window = tvm.build(s, [inc_input, input_window, out], device, name='update_input_window') diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py index d1f50c86464b..1b38e9037fb9 100644 --- a/topi/tests/python/test_topi_batch_matmul.py +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -24,6 +24,12 @@ from common import get_all_backend +_batch_matmul_implement = { + "generic": (topi.nn.batch_matmul, topi.generic.schedule_batch_matmul), + "cpu": (topi.x86.batch_matmul, topi.x86.schedule_batch_matmul), + "gpu": (topi.nn.batch_matmul, topi.cuda.schedule_batch_matmul), +} + def verify_batch_matmul(batch, M, N, K): x = tvm.placeholder((batch, M, K), name='x') y = tvm.placeholder((batch, N, K), name='y') @@ -46,8 +52,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.nn.batch_matmul(x, y) - s = topi.generic.schedule_batch_matmul([out]) + fcompute, fschedule = topi.testing.dispatch(device, _batch_matmul_implement) + out = fcompute(x, y) + s = fschedule([out]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index 56b82b0cda68..2bea9b09bbf4 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -18,7 +18,8 @@ import numpy as np import tvm import topi -from common import get_all_backend, get_schedule_broadcast +import topi.testing +from common import get_all_backend def verify_broadcast_to_ele(in_shape, out_shape, fbcast): @@ -33,7 +34,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) @@ -81,7 +82,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) @@ -252,7 +253,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) data_npy = indata.astype(A.dtype) @@ -293,7 +294,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) data_npy = np.random.uniform(size=shape).astype(A.dtype) @@ -335,7 +336,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name=name) lhs_nd = tvm.nd.array(lhs, ctx) diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index c875e835e8f7..74034ce30b0e 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -18,10 +18,11 @@ import numpy as np import tvm import topi +import topi.testing from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def verify_clip(N, a_min, a_max, dtype): A = tvm.placeholder((N, N), dtype=dtype, name='A') @@ -43,7 +44,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_conv1d.py b/topi/tests/python/test_topi_conv1d.py index d54742c01d14..6e55a574de4a 100644 --- a/topi/tests/python/test_topi_conv1d.py +++ b/topi/tests/python/test_topi_conv1d.py @@ -25,6 +25,18 @@ from common import get_all_backend +_conv1d_ncw_implement = { + "generic": (topi.nn.conv1d_ncw, topi.generic.schedule_conv1d_ncw), + "cpu": (topi.nn.conv1d_ncw, topi.x86.schedule_conv1d_ncw), + "gpu": (topi.cuda.conv1d_ncw, topi.cuda.schedule_conv1d_ncw) +} + +_conv1d_nwc_implement = { + "generic": (topi.nn.conv1d_nwc, topi.generic.schedule_conv1d_nwc), + "cpu": (topi.nn.conv1d_nwc, topi.x86.schedule_conv1d_nwc), + "gpu": (topi.cuda.conv1d_nwc, topi.cuda.schedule_conv1d_nwc) +} + def verify_conv1d(batch, in_channels, in_width, @@ -66,12 +78,13 @@ def check_device(device): if not ctx.exist: print("Skip because %s is not enabled" % device) return + if layout == "NCW": + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_ncw_implement) + else: + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_nwc_implement) with tvm.target.create(device): - B = topi.nn.conv1d(A, W, stride, padding, dilation, layout, 'float32') - if layout == 'NCW': - s = topi.generic.schedule_conv1d_ncw([B]) - else: - s = topi.generic.schedule_conv1d_nwc([B]) + B = fcompute(A, W, stride, padding, dilation, 'float32') + s = fschedule([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv1d_transpose_ncw.py b/topi/tests/python/test_topi_conv1d_transpose_ncw.py index 9d6e9db254b5..64af254adc7d 100644 --- a/topi/tests/python/test_topi_conv1d_transpose_ncw.py +++ b/topi/tests/python/test_topi_conv1d_transpose_ncw.py @@ -24,6 +24,11 @@ from topi.util import get_const_tuple from common import get_all_backend +_conv1d_transpose_ncw_implement = { + "generic": (topi.nn.conv1d_transpose_ncw, topi.generic.schedule_conv1d_transpose_ncw), + "gpu": (topi.cuda.conv1d_transpose_ncw, topi.cuda.schedule_conv1d_transpose_ncw) +} + def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_width = in_size A = tvm.placeholder((batch, in_channel, in_width), name='A') @@ -49,10 +54,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return with tvm.target.create(device): - B = topi.nn.conv1d_transpose_ncw(A, W, stride, padding, A.dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_transpose_ncw_implement) + B = fcompute(A, W, stride, padding, A.dtype) C = topi.nn.relu(B) - s1 = topi.generic.schedule_conv1d_transpose_ncw([B]) - s2 = topi.generic.schedule_conv1d_transpose_ncw([C]) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_conv3d_ncdhw.py b/topi/tests/python/test_topi_conv3d_ncdhw.py index 92b1068a11ec..6c60c27ed426 100644 --- a/topi/tests/python/test_topi_conv3d_ncdhw.py +++ b/topi/tests/python/test_topi_conv3d_ncdhw.py @@ -27,6 +27,11 @@ from common import get_all_backend +_conv3d_ncdhw_implement = { + "generic": (topi.nn.conv3d_ncdhw, topi.generic.schedule_conv3d_ncdhw), + "gpu": (topi.cuda.conv3d_ncdhw, topi.cuda.schedule_conv3d_ncdhw), +} + def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(padding, (kernel, kernel, kernel)) padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right @@ -65,14 +70,15 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ncdhw_implement) with tvm.target.create(device): - C = topi.nn.conv3d(A, W, (stride, stride, stride), padding, - (dilation, dilation, dilation), layout='NCDHW', out_dtype=dtype) + C = fcompute(A, W, (stride, stride, stride), padding, + (dilation, dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv3d_ncdhw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv3d_ndhwc.py b/topi/tests/python/test_topi_conv3d_ndhwc.py index c613f68d062e..0bda67b19333 100644 --- a/topi/tests/python/test_topi_conv3d_ndhwc.py +++ b/topi/tests/python/test_topi_conv3d_ndhwc.py @@ -23,6 +23,13 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple +from common import get_all_backend + +_conv3d_ndhwc_implement = { + "generic": (topi.nn.conv3d_ndhwc, topi.generic.schedule_conv3d_ndhwc), + "cpu": (topi.x86.conv3d_ndhwc, topi.x86.schedule_conv3d_ndhwc), + "gpu": (topi.cuda.conv3d_ndhwc, topi.cuda.schedule_conv3d_ndhwc), +} def verify_conv3d_ndhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): if isinstance(in_size, tuple): @@ -51,13 +58,15 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() def check_device(device): - if not tvm.runtime.enabled(device): + ctx = tvm.context(device, 0) + if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ndhwc_implement) with tvm.target.create(device): - B = topi.nn.conv3d(A, W, stride, padding, dilation, layout="NDHWC") - s = topi.generic.schedule_conv3d_ndhwc([B]) + B = fcompute(A, W, stride, padding, dilation) + s = fschedule([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -66,7 +75,7 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm']: + for device in get_all_backend(): check_device(device) diff --git a/topi/tests/python/test_topi_deformable_conv2d.py b/topi/tests/python/test_topi_deformable_conv2d.py index 45222b6bd489..1b1a0327a3d5 100644 --- a/topi/tests/python/test_topi_deformable_conv2d.py +++ b/topi/tests/python/test_topi_deformable_conv2d.py @@ -25,6 +25,11 @@ from common import get_all_backend +_deformable_conv2d_implement = { + "generic": (topi.nn.deformable_conv2d_nchw, topi.generic.schedule_deformable_conv2d_nchw), + "cuda": (topi.cuda.deformable_conv2d_nchw, topi.cuda.schedule_deformable_conv2d_nchw), +} + def verify_deformable_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, deformable_groups=1, groups=1): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, deformable_groups, groups)) @@ -60,10 +65,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _deformable_conv2d_implement) with tvm.target.create(device): - C = topi.nn.deformable_conv2d_nchw(A, Offset, W, stride, padding, dilation, - deformable_groups, groups, out_dtype=dtype) - s = topi.generic.schedule_deformable_conv2d_nchw([C]) + C = fcompute(A, Offset, W, stride, padding, dilation, + deformable_groups, groups, dtype) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) offset = tvm.nd.array(offset_np, ctx) diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index 3b747712a173..d729e4330e52 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -24,6 +24,19 @@ from common import get_all_backend, Int8Fallback +_dense_implement = { + "generic": [(topi.nn.dense, topi.generic.schedule_dense)], + "cpu": [(topi.x86.dense_nopack, topi.x86.schedule_dense_nopack), + (topi.x86.dense_pack, topi.x86.schedule_dense_pack)], + "gpu": [(topi.cuda.dense_small_batch, topi.cuda.schedule_dense_small_batch), + (topi.cuda.dense_large_batch, topi.cuda.schedule_dense_large_batch)], + "mali": [(topi.mali.dense, topi.mali.schedule_dense)], + "bifrost": [(topi.bifrost.dense, topi.bifrost.schedule_dense)], + "opengl": [(topi.nn.dense, topi.opengl.schedule_dense)], + "rocm": [(topi.rocm.dense, topi.rocm.schedule_dense)], + "hls": [(topi.nn.dense, topi.hls.schedule_dense)], +} + def verify_dense(batch, in_dim, out_dim, use_bias=True): A = tvm.placeholder((batch, in_dim), name='A') B = tvm.placeholder((out_dim, in_dim), name='B') @@ -50,17 +63,18 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - with tvm.target.create(device): - D = topi.nn.dense(A, B, C if use_bias else None) - D = topi.nn.relu(D) - s = topi.generic.schedule_dense([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B, C, D], device, name="dense") - f(a, b, c, d) - tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) + for fcompute, fschedule in topi.testing.dispatch(device, _dense_implement): + with tvm.target.create(device): + D = fcompute(A, B, C if use_bias else None) + D = topi.nn.relu(D) + s = fschedule([D]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(c_np, ctx) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B, C, D], device, name="dense") + f(a, b, c, d) + tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) for device in get_all_backend(): check_device(device) @@ -99,9 +113,9 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - D = topi.nn.dense(A, B, C if use_bias else None, out_dtype=out_dtype) + D = topi.cuda.dense_int8(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) - s = topi.generic.schedule_dense([D]) + s = topi.cuda.schedule_dense_int8([D]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(c_np, ctx) diff --git a/topi/tests/python/test_topi_depth_to_space.py b/topi/tests/python/test_topi_depth_to_space.py index b79597a9e143..693bfb624042 100644 --- a/topi/tests/python/test_topi_depth_to_space.py +++ b/topi/tests/python/test_topi_depth_to_space.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def verify_depth_to_space(block_size, batch, in_channel, in_height, in_width, layout='NCHW', mode='DCR'): @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_image.py b/topi/tests/python/test_topi_image.py index 81c44d1e97e9..4297638b3dfe 100644 --- a/topi/tests/python/test_topi_image.py +++ b/topi/tests/python/test_topi_image.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def verify_resize(batch, in_channel, in_height, in_width, out_height, out_width, layout='NCHW', coord_trans="align_corners", method="bilinear"): @@ -52,7 +52,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -116,7 +116,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -176,7 +176,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(out) + s = topi.testing.get_injective_schedule(device)(out) tvm_images = tvm.nd.array(np_images, ctx) tvm_boxes = tvm.nd.array(np_boxes, ctx) tvm_indices = tvm.nd.array(np_box_indices, ctx) diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py index e0f00f520e12..4cb3c7581800 100644 --- a/topi/tests/python/test_topi_lrn.py +++ b/topi/tests/python/test_topi_lrn.py @@ -21,8 +21,6 @@ from topi.util import get_const_tuple import topi.testing -from common import get_schedule - _lrn_schedule = { "generic": topi.generic.schedule_lrn, "gpu": topi.cuda.schedule_lrn, @@ -47,7 +45,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _lrn_schedule) + s_func = topi.testing.dispatch(device, _lrn_schedule) s = s_func([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index e9d3bc9a576d..debc3efe0d27 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -20,7 +20,7 @@ import topi import topi.testing from topi import util -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def test_util(): @@ -62,7 +62,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) @@ -102,7 +102,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="isnan") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) @@ -152,7 +152,7 @@ def verify(from_dtype, to_dtype, low=-100, high=100): continue print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device) a = tvm.nd.array(a_np, ctx) b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx) diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index 0dd6588f41a1..084a2c7c7671 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -21,7 +21,7 @@ import topi import topi.testing from topi.util import get_const_tuple -from common import get_all_backend, get_schedule +from common import get_all_backend _pool_schedule = { "generic": topi.generic.schedule_pool, @@ -93,7 +93,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _pool_schedule) + s_func = topi.testing.dispatch(device, _pool_schedule) s = s_func(B, layout) a = tvm.nd.array(a_np, ctx) @@ -149,7 +149,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _pool_grad_schedule) + s_func = topi.testing.dispatch(device, _pool_grad_schedule) s = s_func(PoolGrad) a = tvm.nd.array(a_np, ctx) @@ -222,7 +222,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _adaptive_pool_schedule) + s_func = topi.testing.dispatch(device, _adaptive_pool_schedule) s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -277,7 +277,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _adaptive_pool_schedule) + s_func = topi.testing.dispatch(device, _adaptive_pool_schedule) s = s_func(out) a = tvm.nd.array(np_data, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) @@ -321,7 +321,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _pool_schedule) + s_func = topi.testing.dispatch(device, _pool_schedule) s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) @@ -374,7 +374,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _pool_schedule) + s_func = topi.testing.dispatch(device, _pool_schedule) s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 3b854ceba2a9..751025bf82b8 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -19,8 +19,9 @@ import numpy as np import tvm import topi +import topi.testing -from common import get_all_backend, get_schedule_reduce +from common import get_all_backend def _my_npy_argmax(arr, axis, keepdims): if not keepdims: @@ -74,7 +75,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_reduce(device)(B) + s = topi.testing.get_reduce_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=type) # Test diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index ee7aeed037d7..8ef354907691 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -19,10 +19,11 @@ import numpy as np import tvm import topi +import topi.testing from topi.util import get_const_tuple from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend, get_schedule_elemwise +from common import get_all_backend def verify_relu(m, n, dtype="float32"): A = tvm.placeholder((m, n), name='A', dtype=dtype) @@ -41,7 +42,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_elemwise(device)(B) + s = topi.testing.get_elemwise_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py index 487d9cb47997..c4cd2b5d0eb8 100644 --- a/topi/tests/python/test_topi_reorg.py +++ b/topi/tests/python/test_topi_reorg.py @@ -20,7 +20,6 @@ from topi.util import get_const_tuple import tvm import topi.testing -from common import get_schedule _reorg_schedule = { "generic": topi.generic.schedule_reorg, @@ -52,7 +51,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _reorg_schedule) + s_func = topi.testing.dispatch(device, _reorg_schedule) s = s_func([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py index e14c4ecdd206..5396b6beef81 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -23,7 +23,7 @@ import logging from topi.util import get_const_tuple -from common import get_all_backend, get_schedule +from common import get_all_backend _softmax_schedule = { "generic": topi.generic.schedule_softmax, @@ -40,7 +40,7 @@ def check_device(A, B, a_np, b_np, device, name): return print("Running on target: %s" % device) with tvm.target.create(device): - s_func = get_schedule(device, _softmax_schedule) + s_func = topi.testing.dispatch(device, _softmax_schedule) s = s_func(B) a = tvm.nd.array(a_np, ctx) diff --git a/topi/tests/python/test_topi_sort.py b/topi/tests/python/test_topi_sort.py index 0ad4e987d17d..74e55ec248d9 100644 --- a/topi/tests/python/test_topi_sort.py +++ b/topi/tests/python/test_topi_sort.py @@ -21,6 +21,15 @@ import topi import topi.testing +_argsort_implement = { + "generic": (topi.argsort, topi.generic.schedule_argsort), + "gpu": (topi.cuda.argsort, topi.cuda.schedule_argsort), +} + +_topk_implement = { + "generic": (topi.topk, topi.generic.schedule_topk), + "gpu": (topi.cuda.topk, topi.cuda.schedule_topk), +} def verify_argsort(axis, is_ascend): dshape = (20, 100) @@ -48,8 +57,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.argsort(data, axis=axis, is_ascend=is_ascend) - s = topi.generic.schedule_argsort(out) + fcompute, fschedule = topi.testing.dispatch(device, _argsort_implement) + out = fcompute(data, axis=axis, is_ascend=is_ascend) + s = fschedule(out) tvm_data = tvm.nd.array(np_data, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx) @@ -91,9 +101,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - outs = topi.topk(data, k, axis, ret_type, is_ascend, dtype) + fcompute, fschedule = topi.testing.dispatch(device, _topk_implement) + outs = fcompute(data, k, axis, ret_type, is_ascend, dtype) outs = outs if isinstance(outs, list) else [outs] - s = topi.generic.schedule_topk(outs) + s = fschedule(outs) tvm_data = tvm.nd.array(np_data, ctx) tvm_res = [] for t in outs: diff --git a/topi/tests/python/test_topi_space_to_depth.py b/topi/tests/python/test_topi_space_to_depth.py index 0d24de59238b..99a798e733ee 100644 --- a/topi/tests/python/test_topi_space_to_depth.py +++ b/topi/tests/python/test_topi_space_to_depth.py @@ -20,7 +20,7 @@ import topi import topi.testing -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, layout='NCHW'): @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 2e3ce4143a2f..fac62d2a2d55 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -21,7 +21,7 @@ import topi.testing from tvm.contrib.nvcc import have_fp16 -from common import get_all_backend, get_schedule_injective, get_schedule_broadcast, get_schedule_elemwise +from common import get_all_backend def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): A = tvm.placeholder(shape=in_shape, name="A") @@ -33,7 +33,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="expand_dims") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = data_npy.reshape(out_shape) @@ -59,7 +59,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_elemwise(device)(B) + s = topi.testing.get_elemwise_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reinterpret") data_npy = generator(in_shape).astype(in_dtype) out_npy = data_npy.view(B.dtype) @@ -82,7 +82,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="transpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) out_npy = data_npy.transpose(axes) @@ -105,7 +105,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reshape") data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.reshape(data_npy, newshape=dst_shape) @@ -128,7 +128,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="squeeze") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -144,7 +144,7 @@ def check_device(device): def verify_concatenate(shapes, axis): - def get_schedule_concatenate(target): + def get_concat_schedule(target): schedule_map = { "cpu": topi.x86.schedule_concatenate, "arm_cpu": topi.arm_cpu.schedule_concatenate, @@ -154,7 +154,7 @@ def get_schedule_concatenate(target): for key in target.keys: if key in schedule_map: return schedule_map[key] - return get_schedule_injective(target) + return topi.testing.get_injective_schedule(target) tensor_l = [] for i, shape in enumerate(shapes): @@ -167,7 +167,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_concatenate(device)(out_tensor) + s = get_concat_schedule(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -192,7 +192,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(out_tensor) + s = topi.testing.get_broadcast_schedule(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -216,7 +216,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(tensor_l) + s = topi.testing.get_injective_schedule(device)(tensor_l) foo = tvm.build(s, [A] + list(tensor_l), device, name="split") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -275,7 +275,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reverse") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -306,7 +306,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(out_tensor) + s = topi.testing.get_injective_schedule(device)(out_tensor) foo = tvm.build(s, [A] + [indices] + [out_tensor] , device, name="take") shape_size = 1 @@ -341,7 +341,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="stride_slice") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -373,7 +373,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) if strides is not None: foo = tvm.build(s, [A, V, b, e, st, B], device, name="stride_set") @@ -415,7 +415,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(out_tensor) + s = topi.testing.get_injective_schedule(device)(out_tensor) func = tvm.build(s, [A, indices, out_tensor] , device, name="take") shape_size = 1 @@ -454,7 +454,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(A) + s = topi.testing.get_injective_schedule(device)(A) f = tvm.build(s, [A], device, name="arange") a_nd = tvm.nd.empty(a_np.shape, dtype='float32', ctx=ctx) f(a_nd) @@ -473,7 +473,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="repeat") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.repeat(data_npy, repeats, axis) @@ -495,7 +495,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="tile") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.tile(data_npy, reps) @@ -520,7 +520,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_broadcast(device)(C) + s = topi.testing.get_broadcast_schedule(device)(C) f = tvm.build(s, [Cond, A, B, C], device, name="where") cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype) x_npy = np.random.uniform(size=in_shape).astype(dtype) @@ -548,7 +548,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(one_hot_result) + s = topi.testing.get_injective_schedule(device)(one_hot_result) fn = tvm.build(s, [indices, one_hot_result], device, name="one_hot") indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype) out_npy = topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype) @@ -631,7 +631,7 @@ def test_squeeze(): ctx = tvm.context(device, 0) if ctx.exist: with tvm.target.create(device): - s = get_schedule_injective(device)(C) + s = topi.testing.get_injective_schedule(device)(C) func = tvm.build(s, [A, C]) a = tvm.nd.array(np.array((1, 2)).astype('float32'), ctx=ctx) c = tvm.nd.empty((1,), dtype='float32', ctx=ctx) @@ -754,7 +754,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="layout_transform") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -781,7 +781,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="shape") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -813,7 +813,7 @@ def check_device(device): tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32") print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(C) + s = topi.testing.get_injective_schedule(device)(C) f = tvm.build(s, [A, B, C], device, name="SequenceMask") f(tvm_A, tvm_B, tvm_C) tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data) @@ -838,7 +838,7 @@ def check_device(device): tvm_output = tvm.nd.empty((1,), ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="ndarray_size") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -902,5 +902,5 @@ def test_one_hot(): test_shape() test_sequence_mask() test_ndarray_size() - #test_where_fusion() + test_where_fusion() test_one_hot() diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py index 20382da77939..003748719a0e 100644 --- a/topi/tests/python/test_topi_upsampling.py +++ b/topi/tests/python/test_topi_upsampling.py @@ -22,7 +22,7 @@ import math from topi.util import nchw_pack_layout -from common import get_all_backend, get_schedule_injective +from common import get_all_backend def verify_upsampling(batch, in_channel, in_height, in_width, scale_h, scale_w, layout='NCHW', method="nearest_neighbor", @@ -64,7 +64,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -147,7 +147,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = get_schedule_injective(device)(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 254ef8077286..be36903e4d71 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -26,8 +26,6 @@ from topi.util import get_const_tuple from topi.vision import ssd, non_max_suppression, get_valid_counts -from common import get_schedule, get_implement - _get_valid_counts_implement = { "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts), "gpu": (topi.cuda.get_valid_counts, topi.cuda.schedule_get_valid_counts), @@ -91,7 +89,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - fcompute, fschedule = get_implement(device, _get_valid_counts_implement) + fcompute, fschedule = topi.testing.dispatch(device, _get_valid_counts_implement) data = tvm.placeholder(dshape, name="data", dtype=dtype) outs = fcompute(data, score_threshold, id_index, score_index) s = fschedule(outs) @@ -133,7 +131,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - fcompute, fschedule = get_implement(device, _nms_implement) + fcompute, fschedule = topi.testing.dispatch(device, _nms_implement) out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index, return_indices=False) @@ -230,7 +228,7 @@ def check_device(device): out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) else: out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - s_func = get_schedule(device, _multibox_prior_schedule) + s_func = topi.testing.dispatch(device, _multibox_prior_schedule) s = s_func(out) tvm_input_data = tvm.nd.array(input_data, ctx) @@ -277,7 +275,7 @@ def check_device(device): out = ssd.multibox_detection(cls_prob, loc_preds, anchors) else: out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) - s_func = get_schedule(device, _multibox_detection_schedule) + s_func = topi.testing.dispatch(device, _multibox_detection_schedule) s = s_func(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) @@ -320,7 +318,7 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - fcompute, fschedule = get_implement(device, _roi_align_implement) + fcompute, fschedule = topi.testing.dispatch(device, _roi_align_implement) b = fcompute(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio) @@ -373,7 +371,7 @@ def check_device(device): with tvm.target.create(device): b = topi.vision.rcnn.roi_pool_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale) - s_func = get_schedule(device, _roi_pool_schedule) + s_func = topi.testing.dispatch(device, _roi_pool_schedule) s = s_func(b) tvm_a = tvm.nd.array(a_np, ctx) @@ -404,7 +402,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - fcompute, fschedule = get_schedule(device, _proposal_implement) + fcompute, fschedule = topi.testing.dispatch(device, _proposal_implement) out = fcompute(cls_prob, bbox_pred, im_info, **attrs) s = fschedule(out) f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device) From 2df6a84575016a663b9df08ff29836998194e741 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 10 Feb 2020 10:44:52 -0800 Subject: [PATCH 09/48] lint --- .../autotvm/graph_tuner/base_graph_tuner.py | 5 ++--- .../graph_tuner/utils/traverse_graph.py | 5 +++-- python/tvm/relay/backend/compile_engine.py | 4 ++-- python/tvm/relay/op/strategy/generic.py | 12 +++++----- python/tvm/relay/op/strategy/mali.py | 2 +- python/tvm/relay/op/strategy/rocm.py | 2 +- topi/python/topi/arm_cpu/conv2d_alter_op.py | 18 +++++++++------ topi/python/topi/cuda/conv2d_alter_op.py | 10 +++++---- topi/python/topi/intel_graphics/conv2d.py | 22 +++++++++---------- .../topi/intel_graphics/conv2d_alter_op.py | 4 ++-- topi/python/topi/testing/common.py | 3 +++ topi/python/topi/x86/conv2d.py | 2 +- topi/python/topi/x86/conv2d_alter_op.py | 13 ++++++----- 13 files changed, 55 insertions(+), 47 deletions(-) diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 489a97f10d5d..c64049333fc0 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -38,10 +38,9 @@ def get_infer_layout(task_name): if task_name.startswith("conv2d"): return topi.nn.conv2d_infer_layout - elif task_name.startswith("depthwise_conv2d"): + if task_name.startswith("depthwise_conv2d"): return topi.nn.depthwise_conv2d_infer_layout - else: - raise ValueError("Cannot find infer layout for task %s" % task_name) + raise ValueError("Cannot find infer layout for task %s" % task_name) @autotvm.register_customized_task("layout_transform") def layout_transform(*args): diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index cb8de640a25a..17450ca3e7f3 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -49,11 +49,12 @@ def expr2graph(expr, target_ops, node_dict, node_list): {"op": str, "node": tvm.relay.expr, "inputs": [int], "types": [tvm.relay.Type], "name": str, "workloads": [tuple], "topi_op": [function]} """ - env = TaskExtractEnv.get(allow_duplicate=True) - env.reset(target_ops) # TODO(@kevinthesun, @icemelon9): Currently graph tuning pass relies on the fact # that # autotvm tasks == # ops. But this won't be true after having relay op # strategy. We need to find a solution to fix this. + env = TaskExtractEnv.get(allow_duplicate=True) + env.reset(target_ops) + # pylint: disable=not-context-manager with env: _expr2graph_impl(expr, target_ops, node_dict, node_list) task_pos = 0 diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 7f8db95fbbbb..182680822bf6 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=len-as-condition,no-else-return +# pylint: disable=len-as-condition,no-else-return,invalid-name """Backend code generation engine.""" from __future__ import absolute_import @@ -338,7 +338,7 @@ def visit_call(self, call): if not is_dyn: best_impl, outputs = select_implement( op, call.attrs, inputs, ret_type, self.target) - logger.debug("Use implementation %s for op %s" % (best_impl.name, op.name)) + logger.info("Use implementation %s for op %s", best_impl.name, op.name) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes # for dynamic case, we currently use the implementation with highest plevel diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d8cbc1c9db45..f4d57d7d9189 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -40,11 +40,10 @@ def get_conv2d_in_channels(data_shape, data_layout): idx = data_layout.find("C") assert idx >= 0, "Invalid conv2d data layout {}".format(data_layout) return data_shape[idx] - elif re.match(r"NCHW\d*c", data_layout): + if re.match(r"NCHW\d*c", data_layout): # NCHW[8]c return data_shape[1] * data_shape[4] - else: - raise ValueError("Unknown conv2d data layout {}".format(data_layout)) + raise ValueError("Unknown conv2d data layout {}".format(data_layout)) def get_conv2d_out_channels(kernel_shape, kernel_layout): """Get conv2d output channels""" @@ -53,12 +52,11 @@ def get_conv2d_out_channels(kernel_shape, kernel_layout): idx = kernel_layout.find("O") assert idx >= 0, "Invalid conv2d kernel layout {}".format(kernel_layout) return kernel_shape[idx] - elif re.match(r"OIHW\d*i\d*o", kernel_layout): + if re.match(r"OIHW\d*i\d*o", kernel_layout): return kernel_shape[0] * kernel_shape[5] - elif re.match(r"OIHW\d*o", kernel_layout): + if re.match(r"OIHW\d*o", kernel_layout): return kernel_shape[0] * kernel_shape[4] - else: - raise ValueError("Unknown conv2d kernel layout {}".format(kernel_layout)) + raise ValueError("Unknown conv2d kernel layout {}".format(kernel_layout)) def is_depthwise_conv2d(data_shape, data_layout, kernel_shape, kernel_layout, groups): ic = get_conv2d_in_channels(data_shape, data_layout) diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index b30b916a0285..885ad24c3657 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -86,7 +86,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty format(layout)) return strategy -@dense_strategy.register(["mali"]) +@dense_strategy.register("mali") def dense_strategy_mali(attrs, inputs, out_type, target): """dense mali strategy""" strategy = _op.OpStrategy() diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index a356521cc6f7..86921e1fdb15 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -116,7 +116,7 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy -@dense_strategy.register(["rocm"]) +@dense_strategy.register("rocm") def dense_strategy_rocm(attrs, inputs, out_type, target): """Dense strategy for ROCM""" strategy = _op.OpStrategy() diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 20e59e0e014e..3a972b920de2 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member,no-else-return +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D alter op and legalize functions for arm cpu""" import logging @@ -76,7 +76,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - elif topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu": + + if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" N, H, W, CI = get_const_tuple(data.shape) KH, KW, _, CO = get_const_tuple(kernel.shape) @@ -92,7 +93,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - elif topi_tmpl == "conv2d_nchw_winograd.arm_cpu": + + if topi_tmpl == "conv2d_nchw_winograd.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -122,7 +124,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.contrib_conv2d_winograd_without_weight_transform( inputs[0], weight_expr, **new_attrs) - elif topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu": + + if topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -146,7 +149,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_winograd_without_weight_transform( inputs[0], transformed_weight, **new_attrs) - elif topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu": + + if topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -163,5 +167,5 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - else: - return None + + return None diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index 614158c1ac3d..6c3b9f017445 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -79,7 +79,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): "conv2d_NCHWc_int8.cuda") dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - elif topi_tmpl == "conv2d_nchw_winograd.cuda": + + if topi_tmpl == "conv2d_nchw_winograd.cuda": if dilation != (1, 1): logger.warning("Does not support weight pre-transform for dilated convolution.") return None @@ -107,7 +108,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_winograd_without_weight_transform( inputs[0], weight, **new_attrs) - elif topi_tmpl == "group_conv2d_NCHWc_int8.cuda": + + if topi_tmpl == "group_conv2d_NCHWc_int8.cuda": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -130,5 +132,5 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): "group_conv2d_NCHWc_int8.cuda") dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - else: - return None + + return None diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 0171b36ebb43..15211f5cb1d3 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -31,17 +31,17 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False): if is_depthwise: raise RuntimeError("Depthwise not supported for intel graphics.") - else: - batch_size, in_channel, height, width = get_const_tuple(data.shape) - out_channel, _, hkernel, _ = get_const_tuple(kernel.shape) - HSTR, _ = strides - - ic_bn = 1 - oc_bn, oc_bn_upper = 16, 16 - for i in range(oc_bn_upper, 0, -1): - if out_channel % i == 0: - oc_bn = i - break + + batch_size, in_channel, height, width = get_const_tuple(data.shape) + out_channel, _, hkernel, _ = get_const_tuple(kernel.shape) + HSTR, _ = strides + + ic_bn = 1 + oc_bn, oc_bn_upper = 16, 16 + for i in range(oc_bn_upper, 0, -1): + if out_channel % i == 0: + oc_bn = i + break if HSTR == 2: if out_channel + hkernel == 515: diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py index d21a86909baf..4e0314543843 100644 --- a/topi/python/topi/intel_graphics/conv2d_alter_op.py +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -83,8 +83,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs["out_layout"], out_dtype], "conv2d_NCHWc.intel_graphics") dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) - else: - return None + + return None @conv2d_infer_layout.register("intel_graphics") diff --git a/topi/python/topi/testing/common.py b/topi/python/topi/testing/common.py index 2d6bc942efe7..876f3e4b5ccd 100644 --- a/topi/python/topi/testing/common.py +++ b/topi/python/topi/testing/common.py @@ -14,6 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name +"""Common utility for topi test""" + import tvm import topi diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index b3b3671c8451..b93665ee5577 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-member -# pylint: disable=no-value-for-parameter +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Conv2D schedule on x86""" import logging diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 10f11ffe3456..45622fef68b3 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -88,7 +88,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs["out_layout"], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) - elif topi_tmpl == "conv2d_NCHWc_int8.x86": + + if topi_tmpl == "conv2d_NCHWc_int8.x86": # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: @@ -132,7 +133,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs) - elif topi_tmpl == "depthwise_conv2d_NCHWc.x86": + + if topi_tmpl == "depthwise_conv2d_NCHWc.x86": assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, @@ -158,8 +160,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - else: - return None + + return None @conv2d_legalize.register("cpu") @@ -306,5 +308,4 @@ def _conv2d_legalize(attrs, inputs, arg_types): out = relay.subtract(out, adjust_shift) return out - else: - return None + return None From 7908aa4cfb240c2837b1c98ead0adebdea54371e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 10 Feb 2020 16:55:31 -0800 Subject: [PATCH 10/48] address comments --- include/tvm/relay/op_attr_types.h | 12 +- include/tvm/te/schedule.h | 58 +++-- python/tvm/__init__.py | 2 +- python/tvm/relay/backend/compile_engine.py | 242 +++++---------------- python/tvm/te/__init__.py | 4 +- python/tvm/te/schedule.py | 9 +- src/relay/backend/compile_engine.cc | 86 +++----- src/relay/backend/compile_engine.h | 33 ++- src/te/schedule/schedule_lang.cc | 12 +- 9 files changed, 153 insertions(+), 305 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 27325c3d2010..6c6b0e6b6262 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -120,8 +120,8 @@ using FTVMCompute = runtime::TypedPackedFunc< */ using FTVMSchedule = runtime::TypedPackedFunc< te::Schedule(const Attrs& attrs, - const Array& outs, - const Target& target)>; + const Array& outs, + const Target& target)>; /*! * \brief Generate the strategy of operators. This function is a generic @@ -303,8 +303,8 @@ class OpSpecialization : public ObjectRef { public: /*! * \brief Add an implementation. - * \param compute Compute function - * \param schedule Schedule function + * \param fcompute Compute function + * \param fschedule Schedule function * \param name Name of the implementation * \param plevel Priority level of the implementation */ @@ -337,8 +337,8 @@ class OpStrategy : public ObjectRef { public: /*! * \brief Add an implementation. - * \param compute Compute function - * \param schedule Schedule function + * \param fcompute Compute function + * \param fschedule Schedule function * \param name Name of the implementation * \param plevel Priority level of the implementation */ diff --git a/include/tvm/te/schedule.h b/include/tvm/te/schedule.h index 2a88f4c8f7e9..a8a02365fbda 100644 --- a/include/tvm/te/schedule.h +++ b/include/tvm/te/schedule.h @@ -743,25 +743,44 @@ class SingletonNode : public IterVarRelationNode { TVM_DECLARE_FINAL_OBJECT_INFO(SingletonNode, IterVarRelationNode); }; -class SpecializedConditionNode; +/*! \brief Container for specialization conditions. */ +class SpecializedConditionNode : public Object { + public: + /*! + * \brief List of conditions in conjunctive joint form (CNF). + * Each condition should be a simple expression, e.g., n > 16, m % 8 == 0, etc., + * where n, m are tvm::Var that represents a dimension in the tensor shape. + */ + Array clauses; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("clauses", &clauses); + } + + static constexpr const char* _type_key = "SpecializedCondition"; + TVM_DECLARE_FINAL_OBJECT_INFO(SpecializedConditionNode, Object); +}; /*! * \brief Specialized condition to enable op specialization */ class SpecializedCondition : public ObjectRef { public: - SpecializedCondition() {} - explicit SpecializedCondition(ObjectPtr n) : ObjectRef(n) {} + /*! + * \brief construct from conditions + * \param conditions The clauses in the specialized condition. + */ + TVM_DLL SpecializedCondition(Array conditions); // NOLINT(*) + /*! * \brief Get the current specialized condition. - * \return The current specialized condition. + * \return the current specialized condition. */ TVM_DLL static SpecializedCondition Current(); - const SpecializedConditionNode* operator->() const; - - using ContainerType = SpecializedConditionNode; + TVM_DEFINE_OBJECT_REF_METHODS(SpecializedCondition, ObjectRef, SpecializedConditionNode); class Internal; + private: // enable with syntax. friend class Internal; @@ -772,27 +791,6 @@ class SpecializedCondition : public ObjectRef { TVM_DLL void ExitWithScope(); }; -/*! \brief Container for specialization conditions. */ -class SpecializedConditionNode : public Object { - public: - /*! - * \brief List of conditions in conjunctive joint form (CNF). - * Each condition should be a simple expression, e.g., n > 16, m % 8 == 0, etc., - * where n, m are tvm::Var that represents a dimension in the tensor shape. - */ - Array clauses; - - void VisitAttrs(AttrVisitor* v) { - v->Visit("clauses", &clauses); - } - - static SpecializedCondition make(Array conditions); - - static constexpr const char* _type_key = "SpecializedCondition"; - TVM_DECLARE_FINAL_OBJECT_INFO(SpecializedConditionNode, Object); -}; - - // implementations inline const StageNode* Stage::operator->() const { return static_cast(get()); @@ -816,10 +814,6 @@ inline const IterVarAttrNode* IterVarAttr::operator->() const { return static_cast(get()); } -inline const SpecializedConditionNode* SpecializedCondition::operator->() const { - return static_cast(get()); -} - } // namespace te } // namespace tvm #endif // TVM_TE_SCHEDULE_H_ diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index c1b80b887ebf..65cb67266de6 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -50,7 +50,7 @@ from .target import build_config # tvm.te -from .te import decl_tensor_intrin, create_schedule, tag_scope, current_specialization +from .te import decl_tensor_intrin, create_schedule, tag_scope # tvm.testing from . import testing diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 182680822bf6..5ea961ad0203 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -19,35 +19,25 @@ from __future__ import absolute_import import logging -import hashlib import numpy as np import tvm -from topi import tag from ..base import register_relay_node, Object -from ... import _api_internal from ... import target as _target from ... import autotvm from .. import expr as _expr from .. import op as _op from .. import ty as _ty -from ..expr_functor import ExprVisitor from . import _backend logger = logging.getLogger('compile_engine') + @register_relay_node -class CachedFunc(Object): - """Low-level tensor function to back a relay primitive function. - """ - def __init__(self, target, func_name, inputs, outputs, schedule=None, - lowered_funcs=None, shape_func_param_states=None): - if lowered_funcs is None: - lowered_funcs = [] - if shape_func_param_states is None: - shape_func_param_states = [] +class LoweredOutput(Object): + """Lowered output""" + def __init__(self, outputs, implement): self.__init_handle_by_constructor__( - _backend._make_CachedFunc, target, func_name, inputs, outputs, - schedule, lowered_funcs, shape_func_param_states) + _backend._make_LoweredOutput, outputs, implement) @register_relay_node @@ -103,7 +93,7 @@ def get_shape(shape): def get_valid_implements(op, attrs, inputs, out_type, target): """Get all valid implementations from the op strategy. - Note that this function doesn't support op that has symbolic input shapes. + Note that this function doesn't support op with symbolic input shapes. Parameters ---------- @@ -161,7 +151,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): If use_autotvm is False, it'll directly choose the implementation with highest plevel. - Note that this function doesn't support op that has symbolic input shapes. + Note that this function doesn't support op with symbolic input shapes. Parameters ---------- @@ -220,180 +210,52 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): return best_plevel_impl, outputs[best_plevel_impl] -class ScheduleGetter(ExprVisitor): - """Get the schedule given a fused Relay function""" - - MAX_FUNC_NAME_LENGTH = 80 - - def __init__(self, target): - super().__init__() - self.target = target - self.master_op = None - self.master_attrs = None - self.master_op_pattern = 0 - self.master_implement = None - self.func_name = "" - self.scalars = [] - self._device_copy_op = _op.get("device_copy") - - def create(self, prim_func): - """Get the schedule and create the cached function""" - assert isinstance(prim_func, _expr.Function) - assert prim_func.is_primitive() - - def create_tensors(typ, tensors): - if isinstance(typ, _ty.TensorType): - tensors.append(tvm.placeholder(get_shape(typ.shape), typ.dtype)) - else: - assert isinstance(typ, _ty.TupleType) - for field in typ.fields: - create_tensors(field, tensors) - - inputs = [] - for param in prim_func.params: - tensors = [] - create_tensors(param.checked_type, tensors) - self.memo_map[param] = tensors - inputs.extend(tensors) - self.func_name = "fused" - outputs = self.visit(prim_func.body) - if len(self.func_name) > ScheduleGetter.MAX_FUNC_NAME_LENGTH: - hash_digest = int(hashlib.sha1(self.func_name.encode("utf-8")).hexdigest(), 16) - self.func_name = "%s_%s" % ( - self.func_name[:ScheduleGetter.MAX_FUNC_NAME_LENGTH], hash_digest) - - assert self.master_op is not None - tensor_outs = [] - for tensor in outputs: - if not isinstance(tensor.op, tvm.tensor.PlaceholderOp): - tensor_outs.append(tensor) - sch = None - if not isinstance(self.master_attrs, _op.op_attrs.DeviceCopyAttrs): - sch = self.master_implement.schedule(self.master_attrs, tensor_outs, self.target) - for scalar in self.scalars: - if scalar in sch.stage_map: - sch[scalar].compute_inline() - return CachedFunc(self.target, self.func_name, inputs, outputs, sch) - - def visit_var(self, var): - assert False, "Found free variable " + var.name_hint - - def visit_constant(self, const): - assert len(const.data.shape) == 0, "Constant is not scalar" - dtype = const.data.dtype - data = const.data.asnumpy() - def fcompute(): - if dtype.startswith("int"): - return tvm.expr.IntImm(dtype, int(data)) - elif dtype.startswith("uint"): - return tvm.expr.UIntImm(dtype, int(data)) - elif dtype.startswith("float"): - return tvm.expr.FloatImm(dtype, float(data)) +@tvm._ffi.register_func("relay.backend.lower_call") +def lower_call(call, inputs, target): + assert isinstance(call.op, _op.Op) + op = call.op + + # Prepare the call_node->checked_type(). For the call node inputs, we ensure that + # the shape is Int32. Following code ensures the same for the output as well. + # TODO(@icemelon9): Support recursive tuple + ret_type = call.checked_type + if isinstance(ret_type, _ty.TensorType): + ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) + elif isinstance(ret_type, _ty.TupleType): + new_fields = [] + for field in ret_type.fields: + if isinstance(field, _ty.TensorType): + new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) else: - assert False, "not handled" - return tvm.expr.Expr() - value = tvm.compute((), fcompute, name="compile_engine_const", tag=tag.BROADCAST) - self.scalars.append(value.op) - return [value] - - def visit_call(self, call): - inputs = [] - count_tuple = 0 - for arg in call.args: - if isinstance(arg.checked_type, _ty.TupleType): - count_tuple += 1 - inputs.extend(self.visit(arg)) - assert count_tuple <= 1, "Only allow function with a single tuple input" - ret_type = call.checked_type - if isinstance(ret_type, _ty.TensorType): - ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) - elif isinstance(ret_type, _ty.TupleType): - new_fields = [] - for field in ret_type.fields: - if isinstance(field, _ty.TensorType): - new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) - else: - new_fields.append(field) - ret_type = _ty.TupleType(new_fields) - assert isinstance(call.op, _op.Op) - op = call.op - - # disable AutoTVM tracing if op is not in wanted list - env = autotvm.task.TaskExtractEnv.current - reenable_tracing = False - if env is not None and env.tracing: - if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: - env.tracing = False - reenable_tracing = True - - if op == self._device_copy_op: - copy_input = inputs[0] - outputs = [_api_internal._Tensor(copy_input.shape, copy_input.dtype, - None, 0)] - else: - is_dyn = call.checked_type.is_dynamic() - for arg in call.args: - is_dyn = is_dyn or arg.checked_type.is_dynamic() - - if not is_dyn: - best_impl, outputs = select_implement( - op, call.attrs, inputs, ret_type, self.target) - logger.info("Use implementation %s for op %s", best_impl.name, op.name) - else: - # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes - # for dynamic case, we currently use the implementation with highest plevel - best_impl, outputs = select_implement( - op, call.attrs, inputs, ret_type, self.target, use_autotvm=False) - op_pattern = op.get_attr("TOpPattern") - if op_pattern >= _op.OpPattern.COMM_REDUCE: - assert self.master_op is None or self.master_op_pattern < _op.OpPattern.COMM_REDUCE, \ - "Two complicated op in a primitive function master=%s current=%s" % ( - self.master_op, op) - if op_pattern >= self.master_op_pattern: - self.master_op = op - self.master_attrs = call.attrs - self.master_op_pattern = op_pattern - self.master_implement = best_impl - if len(outputs) > 1: - assert isinstance(call.checked_type, _ty.TupleType) - assert len(call.checked_type.fields) == len(outputs) - if op == self._device_copy_op: - self.func_name += "__copy" - else: - self.func_name += "_" + op.name - - # re-enable AutoTVM tracing - if reenable_tracing: - env.tracing = True - - return outputs - - def visit_let(self, let): - val = self.visit(let.value) - assert let.var not in self.memo_map - self.memo_map[let.var] = val - return self.visit(let.body) - - def visit_tuple(self, tup): - fields = [] - for field in tup.fields: - assert isinstance(field.checked_type, _ty.TensorType), "Only allow Tuple of Tensor" - res = self.visit(field) - assert len(res) == 1 - fields.append(res[0]) - return fields - - def visit_tuple_getitem(self, t): - tup = self.visit(t.tuple_value) - assert len(tup) == len(t.tuple_value.checked_type.fields) - assert t.index >= 0 - assert t.index < len(tup) - return [tup[t.index]] - - -@tvm._ffi.register_func("relay.backend.create_schedule") -def create_schedule(src_func, target): - return ScheduleGetter(target).create(src_func) + new_fields.append(field) + ret_type = _ty.TupleType(new_fields) + + is_dyn = call.checked_type.is_dynamic() + for arg in call.args: + is_dyn = is_dyn or arg.checked_type.is_dynamic() + + # check if in the AutoTVM tracing mode, and disable if op is not in wanted list + env = autotvm.task.TaskExtractEnv.current + reenable_tracing = False + if env is not None and env.tracing: + if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: + env.tracing = False + reenable_tracing = True + + if not is_dyn: + best_impl, outputs = select_implement( + op, call.attrs, inputs, ret_type, target) + logger.info("Use implementation %s for op %s", best_impl.name, op.name) + else: + # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. + # Currently, we just use the implementation with highest plevel + best_impl, outputs = select_implement( + op, call.attrs, inputs, ret_type, target, use_autotvm=False) + + # re-enable AutoTVM tracing + if reenable_tracing: + env.tracing = True + return LoweredOutput(outputs, best_impl) @register_relay_node diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 1580da369c33..5970315e854b 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -23,8 +23,8 @@ from tvm.tir import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod from tvm.tir import comm_reducer, min, max, sum -from .schedule import Schedule, create_schedule -from .tensor import Tensor +from .schedule import Schedule, create_schedule, SpecializedCondition +from .tensor import TensorSlice, Tensor from .tensor_intrin import decl_tensor_intrin from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py index affb284da468..6a656a89776d 100644 --- a/python/tvm/te/schedule.py +++ b/python/tvm/te/schedule.py @@ -539,6 +539,11 @@ def __init__(self, conditions): self.__init_handle_by_constructor__( _ffi_api._CreateSpecializedCondition, conditions) + @staticmethod + def current(self): + """Returns the current specialized condition""" + return _ffi_api._GetCurrentSpecialization() + def __enter__(self): _ffi_api._EnterSpecializationScope(self) return self @@ -547,8 +552,4 @@ def __exit__(self, ptype, value, trace): _ffi_api._ExitSpecializationScope(self) -def current_specialization(): - return _ffi_api._GetCurrentSpecialization() - - tvm._ffi._init_api("schedule", __name__) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index bd51fdf1d59e..c0f3e3e8b6dd 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -47,27 +47,17 @@ namespace tvm { namespace relay { +TVM_REGISTER_NODE_TYPE(LoweredOutputNode); TVM_REGISTER_NODE_TYPE(CachedFuncNode); TVM_REGISTER_NODE_TYPE(CCacheKeyNode); TVM_REGISTER_NODE_TYPE(CCacheValueNode); TVM_REGISTER_OBJECT_TYPE(CompileEngineNode); -CachedFunc CachedFuncNode::make(tvm::Target target, - std::string func_name, - tvm::Array inputs, - tvm::Array outputs, - te::Schedule schedule, - tvm::Array funcs, - tvm::Array shape_func_param_states) { - auto n = make_object(); - n->target = std::move(target); - n->func_name = func_name; - n->inputs = std::move(inputs); +LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplement implement) { + auto n = make_object(); n->outputs = std::move(outputs); - n->schedule = std::move(schedule); - n->funcs = std::move(funcs); - n->shape_func_param_states = std::move(shape_func_param_states); - return CachedFunc(n); + n->implement = std::move(implement); + data_ = std::move(n); } CCacheKey CCacheKeyNode::make(Function source_func, Target target) { @@ -118,7 +108,6 @@ Array GetShape(const Array& shape) { return res; } -/* // The getter to get schedule from compile engine. // Get schedule from functor. class ScheduleGetter : @@ -127,9 +116,7 @@ class ScheduleGetter : explicit ScheduleGetter(Target target) : target_(target), device_copy_op_(Op::Get("device_copy")) {} - std::pair Create(const Function& prim_func) { - static auto fschedule = - Op::GetAttr("FTVMSchedule"); + CachedFunc Create(const Function& prim_func) { auto cache_node = make_object(); cache_node->target = target_; for (Var param : prim_func->params) { @@ -166,7 +153,6 @@ class ScheduleGetter : } cache_node->func_name = candidate_name; - CachedFunc cfunc(cache_node); CHECK(master_op_.defined()); // Fusion over tupled results may leave identity relationships // between inputs and outputs, and those should not be scheduled. @@ -180,15 +166,16 @@ class ScheduleGetter : te::Schedule schedule; // No need to register schedule for device copy op. if (master_attrs_.as() == nullptr) { - schedule = - fschedule[master_op_](master_attrs_, tensor_outs, target_); + CHECK(master_implement_.defined()); + schedule = master_implement_.Schedule(master_attrs_, tensor_outs, target_); for (const auto& scalar : scalars_) { if (schedule->Contain(scalar)) { schedule[scalar].compute_inline(); } } } - return std::make_pair(schedule, cfunc); + cache_node->schedule = std::move(schedule); + return CachedFunc(cache_node); } Array VisitExpr(const Expr& expr) { @@ -233,10 +220,10 @@ class ScheduleGetter : } Array VisitExpr_(const CallNode* call_node) final { - static auto fcompute = - Op::GetAttr("FTVMCompute"); static auto fpattern = Op::GetAttr("TOpPattern"); + static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); + CHECK(flower_call) << "relay.backend.lower_call is not registered."; Array inputs; int count_tuple = 0; @@ -253,36 +240,21 @@ class ScheduleGetter : << "Only allow function with a single tuple input"; } - // Prepare the call_node->checked_type(). For the call node inputs, we ensure that the shape is - // Int32. Following code ensures the same for the output as well. - // TODO(@icemelon): Support recursive tuple - Type call_node_type = call_node->checked_type(); - if (const auto* tt = call_node->checked_type().as()) { - call_node_type = TensorType(GetShape(tt->shape), tt->dtype); - } else if (const auto* tuple_t = call_node->checked_type().as()) { - std::vector new_fields; - for (auto field : tuple_t->fields) { - if (const auto* tt = field.as()) { - new_fields.push_back(TensorType(GetShape(tt->shape), tt->dtype)); - } else { - new_fields.push_back(field); - } - } - call_node_type = TupleType(new_fields); - } - CHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); + Array outputs; + OpImplement implement; // Skip fcompute for device copy operators as it is not registered. if (op == device_copy_op_) { const auto* copy_input = inputs[0].operator->(); outputs.push_back(te::TensorNode::make(copy_input->shape, copy_input->dtype, te::Operation(), 0)); } else { - outputs = fcompute[op](call_node->attrs, inputs, - call_node_type, target_); + LoweredOutput lowered_out = (*flower_call)(GetRef(call_node), inputs, target_); + outputs = lowered_out->outputs; + implement = lowered_out->implement; } int op_pattern = fpattern[op]; @@ -295,6 +267,7 @@ class ScheduleGetter : master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; + master_implement_ = implement; } if (outputs.size() != 1) { const auto* tuple_type = @@ -351,6 +324,7 @@ class ScheduleGetter : Op master_op_; Attrs master_attrs_; int master_op_pattern_{0}; + OpImplement master_implement_; std::ostringstream readable_name_stream_; std::unordered_map, ObjectHash, ObjectEqual> memo_; Array scalars_; @@ -358,7 +332,6 @@ class ScheduleGetter : // overhead for each invocation of call node when retrieving schedules. const Op& device_copy_op_; }; -*/ // Creates shape function from functor. class MakeShapeFunc : public ExprFunctor(const Expr&)> { @@ -698,13 +671,14 @@ class CompileEngineImpl : public CompileEngineNode { * The funcs field in cache is not yet populated. */ CachedFunc CreateSchedule(const Function& source_func, const Target& target) { - CachedFunc cfunc; - if (const auto* f = runtime::Registry::Get("relay.backend.create_schedule")) { - cfunc = (*f)(source_func, target); - } else { - LOG(FATAL) << "relay.backend.create_schedule is not registered"; - } - return cfunc; + //CachedFunc cfunc; +// if (const auto* f = runtime::Registry::Get("relay.backend.create_schedule")) { +// cfunc = (*f)(source_func, target); +// } else { +// LOG(FATAL) << "relay.backend.create_schedule is not registered"; +// } +// return cfunc; + return ScheduleGetter(target).Create(source_func); } private: @@ -846,8 +820,10 @@ const CompileEngine& CompileEngine::Global() { return *inst; } -TVM_REGISTER_GLOBAL("relay.backend._make_CachedFunc") -.set_body_typed(CachedFuncNode::make); +TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") +.set_body_typed([](tvm::Array outputs, OpImplement implement) { + return LoweredOutput(outputs, implement); +}); TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") .set_body_typed(CCacheKeyNode::make); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index a405b208ddcb..ff9566d68625 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -44,7 +44,28 @@ enum ShapeFuncParamState { kNeedBoth = 3, }; -class CachedFunc; +struct LoweredOutputNode : public Object { + /*! \brief The outputs to the function */ + tvm::Array outputs; + /*! \brief The implementation used to compute the output */ + OpImplement implement; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("outputs", &outputs); + v->Visit("implement", &implement); + } + + static constexpr const char* _type_key = "relay.LoweredOutput"; + TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object); +}; + +class LoweredOutput : public ObjectRef { + public: + TVM_DLL LoweredOutput(tvm::Array outputs, OpImplement implement); + + TVM_DEFINE_OBJECT_REF_METHODS(LoweredOutput, ObjectRef, LoweredOutputNode); +}; + /*! \brief Node container to represent a cached function. */ struct CachedFuncNode : public Object { /* \brief compiled target */ @@ -55,7 +76,7 @@ struct CachedFuncNode : public Object { tvm::Array inputs; /* \brief The outputs to the function */ tvm::Array outputs; - /* \brief The schedule to the function */ + /*! \brief The schedule to the function */ te::Schedule schedule; /*! \brief The lowered functions to support the function. */ tvm::Array funcs; @@ -72,14 +93,6 @@ struct CachedFuncNode : public Object { v->Visit("shape_func_param_states", &shape_func_param_states); } - TVM_DLL static CachedFunc make(tvm::Target target, - std::string func_name, - tvm::Array inputs, - tvm::Array outputs, - te::Schedule schedule, - tvm::Array funcs, - tvm::Array shape_func_param_states); - static constexpr const char* _type_key = "relay.CachedFunc"; TVM_DECLARE_FINAL_OBJECT_INFO(CachedFuncNode, Object); }; diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 1f3c3a17aa0e..0417a41e74d2 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -789,10 +789,10 @@ IterVarRelation SingletonNode::make(IterVar iter) { return IterVarRelation(n); } -SpecializedCondition SpecializedConditionNode::make(Array conditions) { - auto n = make_object(); - n->clauses = conditions; - return SpecializedCondition(n); +SpecializedCondition::SpecializedCondition(Array conditions) { + ObjectPtr n = make_object(); + n->clauses = std::move(conditions); + data_ = std::move(n); } /*! \brief Entry to hold the SpecializedCondition context stack. */ @@ -826,7 +826,9 @@ SpecializedCondition SpecializedCondition::Current() { } TVM_REGISTER_GLOBAL("_CreateSpecializedCondition") -.set_body_typed(SpecializedConditionNode::make); +.set_body_typed([](Array condition) { + return SpecializedCondition(condition); +}); TVM_REGISTER_GLOBAL("_GetCurrentSpecialization") .set_body([](TVMArgs args, TVMRetValue* ret) { From bde152dc7f50b8e28f1d46902058ce905d4c5b6b Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 10 Feb 2020 16:58:17 -0800 Subject: [PATCH 11/48] x --- src/relay/backend/compile_engine.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index c0f3e3e8b6dd..843037d62796 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -671,13 +671,6 @@ class CompileEngineImpl : public CompileEngineNode { * The funcs field in cache is not yet populated. */ CachedFunc CreateSchedule(const Function& source_func, const Target& target) { - //CachedFunc cfunc; -// if (const auto* f = runtime::Registry::Get("relay.backend.create_schedule")) { -// cfunc = (*f)(source_func, target); -// } else { -// LOG(FATAL) << "relay.backend.create_schedule is not registered"; -// } -// return cfunc; return ScheduleGetter(target).Create(source_func); } From ef20785c3afa3d6050d863a20b830174737c9b09 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 11 Feb 2020 20:17:26 +0000 Subject: [PATCH 12/48] fix more tests & bugs --- python/tvm/relay/op/nn/nn.py | 8 --- python/tvm/relay/op/strategy/cuda.py | 27 +++++---- python/tvm/relay/op/strategy/generic.py | 56 ++++++++++++++----- python/tvm/relay/op/strategy/x86.py | 4 +- python/tvm/relay/op/vision/_vision.py | 24 +------- python/tvm/relay/quantize/_annotate.py | 2 +- python/tvm/relay/testing/mobilenet.py | 30 ++++++---- src/relay/op/nn/convolution.h | 4 +- tests/python/integration/test_tuning.py | 4 +- tests/python/relay/test_any.py | 3 + .../relay/test_autotvm_task_extraction.py | 47 ---------------- tests/python/relay/test_op_qnn_conv2d.py | 15 ++++- .../python/relay/test_pass_alter_op_layout.py | 51 +++++++++-------- tests/python/relay/test_pass_auto_quantize.py | 5 +- .../python/relay/test_pass_fold_scale_axis.py | 9 +-- .../unittest/test_lang_tensor_overload_op.py | 6 +- topi/python/topi/testing/__init__.py | 2 +- topi/python/topi/testing/common.py | 14 +++++ topi/python/topi/x86/dense.py | 6 +- topi/python/topi/x86/depthwise_conv2d.py | 50 ++++++++++------- topi/tests/python/test_fifo_buffer.py | 12 ++-- topi/tests/python/test_topi_conv2d_NCHWc.py | 12 ++-- topi/tests/python/test_topi_conv2d_nchw.py | 15 ++++- topi/tests/python/test_topi_transform.py | 6 +- topi/tests/python/test_topi_vision.py | 32 +++++------ 25 files changed, 233 insertions(+), 211 deletions(-) diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index eaf41cf7871a..9ecb5af8b551 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -204,8 +204,6 @@ def conv2d(data, # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged # convert 2-way padding to 4-way padding padding = get_pad_tuple2d(padding) - if not out_layout: - out_layout = data_layout return _make.conv2d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -299,8 +297,6 @@ def conv3d(data, dilation = (dilation, dilation, dilation) if isinstance(padding, int): padding = (padding, padding, padding) - if not out_layout: - out_layout = data_layout return _make.conv3d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -369,8 +365,6 @@ def conv2d_transpose(data, """ # convert 2-way padding to 4-way padding padding = get_pad_tuple2d(padding) - if not out_layout: - out_layout = data_layout return _make.conv2d_transpose(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, output_padding, out_dtype) @@ -437,8 +431,6 @@ def conv1d_transpose(data, result : tvm.relay.Expr The computed result. """ - if not out_layout: - out_layout = data_layout return _make.conv1d_transpose(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, output_padding, out_dtype) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 0e51aabcb7b2..e65d2910ee3f 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -326,17 +326,24 @@ def topk_strategy_cuda(attrs, inputs, out_type, target): name="topk.cuda") return strategy -@schedule_multibox_prior.register(["cuda", "gpu"]) -def schedule_multibox_prior_cuda(attrs, outs, target): - """schedule multibox_prior for cuda""" - with target: - return topi.cuda.schedule_multibox_prior(outs) +@multibox_prior_strategy.register(["cuda", "gpu"]) +def multibox_prior_strategy_cuda(attrs, inputs, out_type, target): + """multibox_prior cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_multibox_prior(topi.cuda.multibox_prior), + wrap_topi_schedule(topi.cuda.schedule_multibox_prior), + name="multibox_prior.cuda") + return strategy -@schedule_multibox_transform_loc.register(["cuda", "gpu"]) -def schedule_multibox_transform_loc_cuda(attrs, outs, target): - """schedule multibox_transform_loc for cuda""" - with target: - return topi.cuda.schedule_multibox_transform_loc(outs) +@multibox_transform_loc_strategy.register(["cuda", "gpu"]) +def multibox_transform_loc_strategy_cuda(attrs, inputs, out_type, target): + """multibox_transform_loc cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_multibox_transform_loc(topi.cuda.multibox_transform_loc), + wrap_topi_schedule(topi.cuda.schedule_multibox_transform_loc), + name="multibox_transform_loc.cuda") + return strategy @get_valid_counts_strategy.register(["cuda", "gpu"]) def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target): diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index f4d57d7d9189..b20a630e9296 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -459,10 +459,10 @@ def dense_strategy(attrs, inputs, out_type, target): return strategy # batch_matmul -def wrap_compute_batch_matmul(topi_func): +def wrap_compute_batch_matmul(topi_compute): """wrap batch_matmul topi compute""" def _compute_batch_matmul(attrs, inputs, out_type): - return [topi_func(inputs[0], inputs[1])] + return [topi_compute(inputs[0], inputs[1])] return _compute_batch_matmul @override_native_generic_func("batch_matmul_strategy") @@ -509,7 +509,7 @@ def argsort_strategy(attrs, inputs, out_type, target): return strategy # topk -def wrap_compute_topk(topi_func): +def wrap_compute_topk(topi_compute): """Wrap topk compute""" def _compute_topk(attrs, inputs, out_type): k = get_const_int(attrs.k) @@ -517,7 +517,7 @@ def _compute_topk(attrs, inputs, out_type): ret_type = attrs.ret_type is_ascend = bool(get_const_int(attrs.is_ascend)) dtype = attrs.dtype - out = topi_func(inputs[0], k, axis, ret_type, is_ascend, dtype) + out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype) out = out if isinstance(out, list) else [out] return out return _compute_topk @@ -532,18 +532,48 @@ def topk_strategy(attrs, inputs, out_type, target): return strategy # multibox_prior -@generic_func -def schedule_multibox_prior(attrs, outs, target): - """schedule multibox_prior""" - with target: - return topi.generic.schedule_multibox_prior(outs) +def wrap_compute_multibox_prior(topi_compute): + """Wrap multibox_prior compute""" + def _compute_multibox_prior(attrs, inputs, _): + """Compute definition of multibox_prior""" + sizes = get_float_tuple(attrs.sizes) + ratios = get_float_tuple(attrs.ratios) + steps = get_float_tuple(attrs.steps) + offsets = get_float_tuple(attrs.offsets) + clip = bool(get_const_int(attrs.clip)) + return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)] + return _compute_multibox_prior + +@override_native_generic_func("multibox_prior_strategy") +def multibox_prior_strategy(attrs, inputs, out_type, target): + """multibox_prior generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implement(wrap_compute_multibox_prior(topi.vision.ssd.multibox_prior), + wrap_topi_schedule(topi.generic.schedule_multibox_prior), + name="multibox_prior.generic") + return strategy # multibox_transform_loc -@generic_func -def schedule_multibox_transform_loc(attrs, outs, target): +def wrap_compute_multibox_transform_loc(topi_compute): + """Wrap multibox_transform_loc compute""" + def _compute_multibox_transform_loc(attrs, inputs, _): + """Compute definition of multibox_detection""" + clip = bool(get_const_int(attrs.clip)) + threshold = get_const_float(attrs.threshold) + variances = get_float_tuple(attrs.variances) + return topi_compute( + inputs[0], inputs[1], inputs[2], clip, threshold, variances) + return _compute_multibox_transform_loc + +@override_native_generic_func("multibox_transform_loc_strategy") +def multibox_transform_loc_strategy(attrs, inputs, out_type, target): """schedule multibox_transform_loc""" - with target: - return topi.generic.schedule_multibox_transform_loc(outs) + strategy = _op.OpStrategy() + strategy.add_implement( + wrap_compute_multibox_transform_loc(topi.vision.ssd.multibox_transform_loc), + wrap_topi_schedule(topi.generic.schedule_multibox_transform_loc), + name="multibox_transform_loc.generic") + return strategy # get_valid_counts def wrap_compute_get_valid_counts(topi_compute): diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index ae0e7a1bf2d1..1cf05a0165d2 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -224,7 +224,7 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() - _, k = inputs[0].shape + m, _ = inputs[0].shape strategy.add_implement(wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", @@ -234,7 +234,7 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.x86.schedule_dense_cblas), name="dense_cblas.x86", plevel=5) - with SpecializedCondition(k > 16): + with SpecializedCondition(m >= 16): # this implementation may not be well-optimized, so use plevel=8 for now. strategy.add_implement(wrap_compute_dense(topi.x86.dense_pack), wrap_topi_schedule(topi.x86.schedule_dense_pack), diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 737954da82ba..6b80aa7778f8 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -25,32 +25,12 @@ from ..op import OpPattern # multibox_prior -@reg.register_compute("vision.multibox_prior") -def compute_multibox_prior(attrs, inputs, _): - """Compute definition of multibox_prior""" - sizes = get_float_tuple(attrs.sizes) - ratios = get_float_tuple(attrs.ratios) - steps = get_float_tuple(attrs.steps) - offsets = get_float_tuple(attrs.offsets) - clip = bool(get_const_int(attrs.clip)) - return [topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps, - offsets, clip)] - -reg.register_schedule("vision.multibox_prior", strategy.schedule_multibox_prior) +reg.register_strategy("vision.multibox_prior", strategy.multibox_prior_strategy) reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE) # multibox_transform_loc -@reg.register_compute("vision.multibox_transform_loc") -def compute_multibox_transform_loc(attrs, inputs, _): - """Compute definition of multibox_detection""" - clip = bool(get_const_int(attrs.clip)) - threshold = get_const_float(attrs.threshold) - variances = get_float_tuple(attrs.variances) - return topi.vision.ssd.multibox_transform_loc( - inputs[0], inputs[1], inputs[2], clip, threshold, variances) - -reg.register_schedule("vision.multibox_transform_loc", strategy.schedule_multibox_transform_loc) +reg.register_strategy("vision.multibox_transform_loc", strategy.multibox_transform_loc_strategy) reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 71cd86723a6c..b77516de6839 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -31,7 +31,7 @@ @_reg.register_compute("relay.op.annotation.simulated_quantize") -def simulated_quantize_compute(attrs, inputs, out_type, target): +def simulated_quantize_compute(attrs, inputs, out_type): """Compiler for simulated_quantize.""" assert len(inputs) == 4 assert attrs.sign diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index 9aaefdfdb02d..d5a4d5f1e08f 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,15 +44,18 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW'): + epsilon=1e-5, layout='NCHW', dtype="float32"): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu + wshape = (depthwise_channels, 1) + kernel_size + weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) conv1 = layers.conv2d( data=data, + weight=weight, channels=depthwise_channels, groups=depthwise_channels, kernel_size=kernel_size, @@ -85,38 +88,41 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', - int(32*alpha), int(64*alpha), layout=layout) + int(32*alpha), int(64*alpha), layout=layout, + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_3', - int(128*alpha), int(128*alpha), layout=layout) + int(128*alpha), int(128*alpha), layout=layout, + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_5', - int(256*alpha), int(256*alpha), layout=layout) + int(256*alpha), int(256*alpha), layout=layout, + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_13', int(1024*alpha), int(1024*alpha), - layout=layout) + layout=layout, dtype=dtype) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 9e303260c07f..9ee84a0332bb 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -155,8 +155,8 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); bool is_depthwise = false; if (param->groups > 1) { - CHECK(weight->shape.defined()) << "Weight shape must be specified " << - "when groups is greater than 1."; + CHECK(weight && weight->shape.defined()) << + "Weight shape must be specified when groups is greater than 1."; Array wshape_oihw = trans_kernel_layout.ForwardShape(weight->shape); if (tvm::tir::Equal(param->groups, dshape_nchw[1]) && tvm::tir::Equal(param->groups, wshape_oihw[0])) { diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index fec7d3e4f83f..99f8b47cce07 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -25,7 +25,7 @@ from tvm import autotvm from tvm.autotvm.tuner import RandomTuner -@autotvm.template +@autotvm.register_customized_task("testing/conv2d_no_batching") def conv2d_no_batching(N, H, W, CI, CO, KH, KW): """An example template for testing""" assert N == 1, "Only consider batch_size = 1 in this template" @@ -114,7 +114,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW): def get_sample_task(target=tvm.target.cuda(), target_host=None): """return a sample task for testing""" - task = autotvm.task.create(conv2d_no_batching, + task = autotvm.task.create("testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target, target_host=target_host) return task, target diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3e392a8e630f..24176e4c41dd 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import numpy as np +import pytest import tvm from tvm import relay @@ -384,6 +385,8 @@ def verify_any_conv2d_NCHWc(data_shape, kernel_shape, strides, padding, dilation assert result.asnumpy().shape == ref_out_shape, \ "Shape mismatch: expect %s but got %s." % (str(ref_out_shape), str(result.asnumpy().shape)) +# TODO(@kevinthesun): Need to fix the compute in conv2d_NCHWc to support any +@pytest.mark.skip def test_any_conv2d_NCHWc(): verify_any_conv2d_NCHWc((relay.Any(), 8, relay.Any(), relay.Any(), 8), (8, 8, 3, 3, 8, 8), (1, 1), (1, 1), (1, 1), "NCHW8c", "OIHW8i8o", "NCHW8c", (1, 8, 224, 224, 8), (1, 8, 224, 224, 8)) diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py index 73dbf106b541..e555caada626 100644 --- a/tests/python/relay/test_autotvm_task_extraction.py +++ b/tests/python/relay/test_autotvm_task_extraction.py @@ -97,52 +97,5 @@ def test_task_extraction(): ops=(conv2d,)) assert len(tasks) == 31 -def test_template_key_provided(): - """test task extraction using non-'direct' template_key""" - target = 'llvm' - - import topi - template_keys = { - # topi.nn.conv2d - is left blank to test fallback logic - topi.nn.dense: 'direct_nopack', - topi.nn.depthwise_conv2d_nchw: 'direct', - } - - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense), - template_keys=template_keys) - for task in tasks: - if 'dense' in task.name: - assert task.config_space.template_key == 'direct_nopack' - else: - assert task.config_space.template_key == 'direct' - -def test_template_key_empty(): - """test task extraction using empty template_key""" - target = 'llvm' - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense), - template_keys=None) - for task in tasks: - assert task.config_space.template_key == 'direct' - -def test_template_key_default(): - """test task extraction without template_key""" - target = 'llvm' - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) - for task in tasks: - assert task.config_space.template_key == 'direct' - if __name__ == '__main__': test_task_extraction() - # TODO(@icemelon9): template key will no long exist, remove these tasks. - # test_template_key_provided() - # test_template_key_empty() - # test_template_key_default() diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index 67a7ef694033..3b88e1c925d8 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -431,9 +431,9 @@ def test_layout(): kernel_shape, kernel_dtype) # NHWC and HWOI layout. Used in depthwise conv. - data_shape = (2, 2, 4, 1) # NHWC + data_shape = (2, 2, 4, 3) # NHWC data_dtype = 'uint8' - kernel_shape = (2, 2, 1, 1) # HWOI + kernel_shape = (2, 2, 3, 1) # HWOI kernel_dtype = 'uint8' ref_func, qnn_func = get_funcs(data_shape=data_shape, data_dtype=data_dtype, @@ -447,6 +447,7 @@ def test_layout(): padding=(0, 0), strides=(1, 1), dilation=(1, 1), + groups=3, data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32") @@ -826,7 +827,12 @@ def test_depthwise_depth_multiplier(): data_layout="NCHW", kernel_layout="OIHW", out_dtype="int32", +<<<<<<< HEAD groups=8) +======= + groups=4, + channels=8) +>>>>>>> fix more tests & bugs verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) @@ -875,7 +881,12 @@ def test_depthwise_depth_multiplier(): data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32", +<<<<<<< HEAD groups=8) +======= + groups=4, + channels=8) +>>>>>>> fix more tests & bugs verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 2ec3f282a6c4..df01310937ed 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. """Test alter op layout pass""" -import tvm +import pytest +import tvm from tvm import relay from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -43,7 +44,7 @@ def before(): y = relay.Function([x, weight], y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs weight = relay.multiply(weight, relay.const(2.0, "float32")) return relay.nn.conv2d(data, weight, **attrs) @@ -77,7 +78,7 @@ def before(): called = [False] - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): called[0] = True return None @@ -109,7 +110,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -176,7 +177,7 @@ def before(): y = relay.Function(analysis.free_vars(ret), ret) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -236,7 +237,7 @@ def before(): y = relay.nn.global_max_pool2d(y) return relay.Function(analysis.free_vars(y), y) - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -285,7 +286,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -342,7 +343,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -394,7 +395,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -427,7 +428,7 @@ def expected(): def test_alter_layout_concatenate(): """ NCHW, NHWC and corner case concatenate layout transform.""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -536,7 +537,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -572,7 +573,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW4c' @@ -607,9 +608,9 @@ def before(): return y import topi - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): with tvm.target.create("llvm"): - return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, relay) + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) def expected(): @@ -643,7 +644,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -675,7 +676,7 @@ def expected(): def test_alter_layout_pad(): """ Check NCHW, NHWC and corner case for pad layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -785,7 +786,7 @@ def expected(): def test_alter_layout_pool(): """ Check NCHW, NHWC pool layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -862,7 +863,7 @@ def expected_nhwc(): def test_alter_layout_sum(): """ Check NCHW, NHWC sum layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -938,11 +939,15 @@ def expected_nhwc(): assert analysis.alpha_equal(a, b), "Actual = \n" + str(a) +# TODO(@anijain2305, @icemelon9): We should fix this. This doesn't seem to be the +# right behavior of alter_layout +@pytest.mark.skip def test_alter_layout_nhwc_nchw_arm(): """ Check NHWC to NHCW conversion for a small sequence of ops.""" - def alter_conv2d(attrs, inputs, tinfos): - from topi.arm_cpu.conv2d import _alter_conv2d_layout_arm - return _alter_conv2d_layout_arm(attrs, inputs, tinfos, tvm.relay) + def alter_conv2d(attrs, inputs, tinfos, out_type): + import topi + with tvm.target.create("llvm -device=arm_cpu"): + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) # Check NHWC conversion. def before_nhwc(): @@ -1011,7 +1016,7 @@ def before(): mod["main"] = relay.Function([x, weight], foo(x, weight)) return mod - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs weight = relay.multiply(weight, relay.const(2.0, "float32")) return relay.nn.conv2d(data, weight, **attrs) @@ -1054,5 +1059,5 @@ def expected(): test_alter_layout_pad() test_alter_layout_pool() test_alter_layout_sum() - test_alter_layout_nhwc_nchw_arm() + # test_alter_layout_nhwc_nchw_arm() test_alter_op_with_global_var() diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 443d2e40634d..02438ef04f2a 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. import numpy as np +import pytest + import tvm from tvm import relay from tvm.relay import testing @@ -55,7 +57,8 @@ def get_calibration_dataset(input_name): return dataset -def test_calibrate_target(create_target=False): +@pytest.mark.parametrize("create_target", [True, False]) +def test_calibrate_target(create_target): mod, params = testing.resnet.get_workload(num_layers=18) dataset = get_calibration_dataset("data") with relay.quantize.qconfig(calibrate_mode="kl_divergence"): diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py index 13995732d8ee..bfc3caba45e3 100644 --- a/tests/python/relay/test_pass_fold_scale_axis.py +++ b/tests/python/relay/test_pass_fold_scale_axis.py @@ -131,12 +131,13 @@ def expected(x, conv_weight, in_bias, in_scale, channels): z = relay.add(y1, y2) return relay.Function(args, z) - def check(shape, channels): - x = relay.var("x", shape=shape) - in_channels = shape[-1] + def check(dshape, channels): + x = relay.var("x", shape=dshape) + in_channels = dshape[-1] # test depthwise assert in_channels == channels - weight = relay.var("weight") + wshape = (3, 3, 1, channels) # HWIO + weight = relay.var("weight", shape=wshape) in_bias = relay.var("in_bias", shape=(in_channels,)) in_scale = relay.const(_get_positive_scale(in_channels,)) y1 = before(x, weight, in_bias, in_scale, channels) diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py index 22bc28a64297..01c0d26dfc9b 100644 --- a/tests/python/unittest/test_lang_tensor_overload_op.py +++ b/tests/python/unittest/test_lang_tensor_overload_op.py @@ -190,12 +190,14 @@ def check_device(device): return print("Running on target: %s" % device) + conv2d_nchw, schedule_conv2d_nchw = topi.testing.get_conv2d_nchw_implement(device) + k = 10.0 dilation = (1, 1) with tvm.target.create(device): A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - B = topi.nn.conv2d(A, W, stride, padding, dilation) + B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype) if typ == "add": C = B + k elif typ == "sub": @@ -206,7 +208,7 @@ def check_device(device): C = B / k else: raise NotImplementedError() - s = topi.generic.schedule_conv2d_nchw([C]) + s = schedule_conv2d_nchw([C]) foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index a9c8b49ce4bd..b0f4752ad492 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -54,4 +54,4 @@ from .space_to_depth import space_to_depth_python from .crop_and_resize_python import crop_and_resize_python from .common import get_injective_schedule, get_reduce_schedule, get_broadcast_schedule, \ - get_elemwise_schedule, dispatch + get_elemwise_schedule, get_conv2d_nchw_implement, dispatch diff --git a/topi/python/topi/testing/common.py b/topi/python/topi/testing/common.py index 876f3e4b5ccd..4c926e991399 100644 --- a/topi/python/topi/testing/common.py +++ b/topi/python/topi/testing/common.py @@ -53,3 +53,17 @@ def get_reduce_schedule(target): get_broadcast_schedule = get_injective_schedule get_elemwise_schedule = get_injective_schedule + +_conv2d_nchw_implement = { + "generic": (topi.nn.conv2d_nchw, topi.generic.schedule_conv2d_nchw), + "cpu": (topi.x86.conv2d_nchw, topi.x86.schedule_conv2d_nchw), + "arm_cpu": (topi.arm_cpu.conv2d_nchw_spatial_pack, topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + "gpu": (topi.cuda.conv2d_nchw, topi.cuda.schedule_conv2d_nchw), + "mali": (topi.mali.conv2d_nchw_spatial_pack, topi.mali.schedule_conv2d_nchw_spatial_pack), + "bifrost": (topi.bifrost.conv2d_nchw_spatial_pack, topi.bifrost.schedule_conv2d_nchw_spatial_pack), + "opengl": (topi.nn.conv2d_nchw, topi.opengl.schedule_conv2d_nchw), + "intel_graphics": (topi.intel_graphics.conv2d_nchw, topi.intel_graphics.schedule_conv2d_nchw) +} + +def get_conv2d_nchw_implement(target): + return dispatch(target, _conv2d_nchw_implement) diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index d03839fe6319..ea89cf4779b0 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -141,9 +141,9 @@ def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) # create tuning space - cfg.define_split("tile_y", M, num_outputs=2) - cfg.define_split("tile_x", N, num_outputs=2) - cfg.define_split("tile_k", K, num_outputs=2) + cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=2) + cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=2) + cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) if cfg.is_fallback: _default_dense_nopack_config(cfg, M, N, K) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 275f33da54f3..488842e95d54 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -26,6 +26,7 @@ from ..nn.util import get_pad_tuple from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout from ..nn.conv2d import unpack_NCHWc_to_nchw +from ..util import traverse_inline from .util import get_fp32_len def _fallback_schedule(cfg, wkl): @@ -134,7 +135,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, # get workload and related schedule config wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), - tvm.placeholder((out_channel, in_channel, filter_height, filter_width), + tvm.placeholder((out_channel, channel_multiplier, filter_height, filter_width), dtype=kernel.dtype), strides, padding, out_dtype) if cfg.is_fallback: @@ -181,27 +182,20 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, def schedule_depthwise_conv2d_NCHWc(cfg, outs): """CPU schedule for depthwise conv2d in NCHW[x]c layout""" s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + + def _callback(op): """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) if 'depthwise_conv2d_NCHWc' in op.tag: conv_out = op.output(0) data = conv_out.op.input_tensors[0] kernel = conv_out.op.input_tensors[1] _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0]) - scheduled_ops.append(op) - traverse(outs[0].op) + + traverse_inline(s, outs[0].op, _callback) return s def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out, output): - tile_ow = cfg["tile_ow"].size[-1] + tile_ow, oc_bn = cfg["tile_ow"].size[-1], cfg["tile_oc"].size[-1] # schedule pad if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ and "pad" in data_vec.op.tag: @@ -235,13 +229,29 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out s[CC].unroll(ow_block) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) + print(ow_chunk, ow_block) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + print(oc_chunk, oc_block) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) + return s @depthwise_conv2d_infer_layout.register("cpu") diff --git a/topi/tests/python/test_fifo_buffer.py b/topi/tests/python/test_fifo_buffer.py index 82c230629c05..34c389aad6c9 100644 --- a/topi/tests/python/test_fifo_buffer.py +++ b/topi/tests/python/test_fifo_buffer.py @@ -128,14 +128,15 @@ def check_device(device): return print(' Running on target: {}'.format(device)) + conv2d_nchw, schedule_conv2d_nchw = topi.testing.get_conv2d_nchw_implement(device) + with tvm.target.create(device): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) s = topi.testing.get_injective_schedule(device)([out]) update_context = tvm.build(s, [inc_input, context, out], device, name='update_context') - out = topi.nn.conv2d(context, kernel, strides=stride, padding=padding, dilation=dilate, - layout='NCHW', out_dtype=dtype) - s = topi.generic.schedule_conv2d_nchw([out]) + out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype) + s = schedule_conv2d_nchw([out]) conv2d_inc = tvm.build(s, [context, kernel, out], device, name='conv2d_inc') out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) @@ -148,9 +149,8 @@ def check_device(device): update_input_window = tvm.build(s, [inc_input, input_window, out], device, name='update_input_window') - out = topi.nn.conv2d(input_window, kernel, strides=stride, padding=padding, - dilation=dilate, layout='NCHW', out_dtype=dtype) - s = topi.generic.schedule_conv2d_nchw([out]) + out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype) + s = schedule_conv2d_nchw([out]) conv2d = tvm.build(s, [input_window, kernel, out], device, name='conv2d') input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx) diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py index af585904293f..8a74b4f06cd2 100644 --- a/topi/tests/python/test_topi_conv2d_NCHWc.py +++ b/topi/tests/python/test_topi_conv2d_NCHWc.py @@ -98,16 +98,16 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), padding, - (dilation, dilation), - layout='NCHW%dc'%ic_block, - out_layout="NCHW%dc"%oc_block, - out_dtype=dtype) + C = topi.x86.conv2d_NCHWc(A, W, (stride, stride), padding, + (dilation, dilation), + 'NCHW%dc'%ic_block, + "NCHW%dc"%oc_block, + dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_NCHWc([C]) + s = topi.x86.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 85d733c5d546..a0258ec93bf2 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -66,18 +66,27 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + if "cudnn" in device: + fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn + else: + fcompute, fschedule = topi.testing.get_conv2d_nchw_implement(device) + with tvm.target.create(device): - C = topi.nn.conv2d(A, W, (stride, stride), padding, - (dilation, dilation), layout='NCHW', out_dtype=dtype) + if "cudnn" in device: + C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype) + else: + C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index fac62d2a2d55..880e86d205e7 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -856,9 +856,10 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + conv2d_compute, conv2d_schedule = topi.testing.get_conv2d_nchw_implement(device) data = tvm.placeholder((2, 1, 2, 4), 'int8', 'data') w = tvm.placeholder((3, 1, 2, 2), 'int8', 'w') - conv1 = topi.nn.conv2d(data, w, 1, 0, 1, out_dtype='int32') + conv1 = conv2d_compute(data, w, 1, 0, 1, 'int32') zeros = topi.full((2, 3, 1, 3), 'int32', tvm.const(0, dtype='int32')) gt = topi.greater_equal(conv1, zeros) one = topi.full((2, 3, 1, 3), 'int32', tvm.const(1, dtype='int32')) @@ -866,8 +867,7 @@ def check_device(device): where = topi.where(gt, one, two) add = topi.add(conv1, where) outs = [add] - # TODO(@icemelon9): fix here - s = topi.generic.schedule_conv2d_nchw(outs) + s = conv2d_schedule(outs) tvm.build(s, [data, w, add], target=backend) for backend in get_all_backend(): diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index be36903e4d71..7d27b8221a60 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -36,14 +36,14 @@ "gpu": (topi.cuda.non_max_suppression, topi.cuda.schedule_nms), } -_multibox_prior_schedule = { - "generic": topi.generic.schedule_multibox_prior, - "gpu": topi.cuda.schedule_multibox_prior, +_multibox_prior_implement = { + "generic": (topi.vision.ssd.multibox_prior, topi.generic.schedule_multibox_prior), + "gpu": (topi.cuda.multibox_prior, topi.cuda.schedule_multibox_prior), } -_multibox_detection_schedule = { - "generic": topi.generic.schedule_multibox_detection, - "gpu": topi.cuda.schedule_multibox_detection, +_multibox_detection_implement = { + "generic": (topi.vision.ssd.multibox_detection, topi.generic.schedule_multibox_detection), + "gpu": (topi.cuda.multibox_detection, topi.cuda.schedule_multibox_detection), } _roi_align_implement = { @@ -223,13 +223,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + fcompute, fschedule = topi.testing.dispatch(device, _multibox_prior_implement) with tvm.target.create(device): - if device == 'llvm': - out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - else: - out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - s_func = topi.testing.dispatch(device, _multibox_prior_schedule) - s = s_func(out) + out = fcompute(data, sizes, ratios, steps, offsets, clip) + s = fschedule(out) tvm_input_data = tvm.nd.array(input_data, ctx) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) @@ -270,13 +268,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + fcompute, fschedule = topi.testing.dispatch(device, _multibox_detection_implement) with tvm.target.create(device): - if device == 'llvm': - out = ssd.multibox_detection(cls_prob, loc_preds, anchors) - else: - out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) - s_func = topi.testing.dispatch(device, _multibox_detection_schedule) - s = s_func(out) + out = fcompute(cls_prob, loc_preds, anchors) + s = fschedule(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), ctx) From f9a41c6cb8d69b6a824acb2bfbba13db0a4acb13 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Tue, 11 Feb 2020 14:32:38 -0800 Subject: [PATCH 13/48] Modify more tests (#10) * Modify tests for bitserial_conv2d, bitserial_dense, bitserial_conv2d_rasp and bnn * Minor fix * More minor fix --- topi/python/topi/generic/nn.py | 2 - topi/python/topi/x86/binarize_pack.py | 2 - topi/python/topi/x86/binary_dense.py | 2 - .../python/test_topi_bitserial_conv2d.py | 12 ++--- .../python/test_topi_bitserial_conv2d_rasp.py | 6 +-- .../tests/python/test_topi_bitserial_dense.py | 52 +++++++++++-------- topi/tests/python/test_topi_bnn.py | 6 +-- 7 files changed, 43 insertions(+), 39 deletions(-) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index ab926e8fb162..ba50a8b88cb4 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -532,7 +532,6 @@ def schedule_adaptive_pool(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_binarize_pack") def schedule_binarize_pack(outs): """Schedule for binarize_pack @@ -566,7 +565,6 @@ def schedule_bitpack(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_binary_dense") def schedule_binary_dense(outs): """Schedule for binary_dense diff --git a/topi/python/topi/x86/binarize_pack.py b/topi/python/topi/x86/binarize_pack.py index ea2bbed7345e..bab91a940edc 100644 --- a/topi/python/topi/x86/binarize_pack.py +++ b/topi/python/topi/x86/binarize_pack.py @@ -18,10 +18,8 @@ """Schedule for binarization and bit-packing.""" from __future__ import absolute_import as _abs import tvm -from .. import generic -@generic.schedule_binarize_pack.register(["cpu"]) def schedule_binarize_pack(outs): """Schedule for binarize_pack. diff --git a/topi/python/topi/x86/binary_dense.py b/topi/python/topi/x86/binary_dense.py index abf090889ec3..ccf74e7bd230 100644 --- a/topi/python/topi/x86/binary_dense.py +++ b/topi/python/topi/x86/binary_dense.py @@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic -@generic.schedule_binary_dense.register(["cpu"]) def schedule_binary_dense(outs): """Schedule for binary_dense. diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index eeaeed15df1c..274743d274ae 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -35,9 +35,9 @@ def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create('llvm'): A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_dtype, name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_dtype, name='W') - B = topi.nn.bitserial_conv2d_nchw(A, W, stride, padding, activation_bits, weight_bits, - out_dtype=out_dtype, unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + B = topi.x86.bitserial_conv2d_nchw(A, W, stride, padding, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = topi.x86.schedule_bitserial_conv2d_nchw([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @@ -73,9 +73,9 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create('llvm'): A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_dtype, name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_dtype, name='W') - B = topi.nn.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, - out_dtype=out_dtype, unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + B = topi.x86.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = topi.x86.schedule_bitserial_conv2d_nhwc([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index 1b2f40de1b21..1f87785b4f48 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -39,9 +39,9 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create(device): A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') - B = topi.nn.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, - pack_dtype='uint8', out_dtype='int16', unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + B = topi.arm_cpu.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, + 'uint8', out_dtype, unipolar) + s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B]) func = tvm.build(s, [A, W, B], device) diff --git a/topi/tests/python/test_topi_bitserial_dense.py b/topi/tests/python/test_topi_bitserial_dense.py index f1bd02357796..505ce794312f 100644 --- a/topi/tests/python/test_topi_bitserial_dense.py +++ b/topi/tests/python/test_topi_bitserial_dense.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Test code for bitserial_dense operator""" +import os import numpy as np import tvm import topi @@ -22,27 +23,21 @@ from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize +_bitserial_dense_implement = { + "generic": (topi.nn.bitserial_dense, topi.generic.schedule_bitserial_dense), + "cpu": (topi.x86.bitserial_dense, topi.x86.schedule_bitserial_dense), + "arm_cpu": (topi.arm_cpu.bitserial_dense, topi.arm_cpu.schedule_bitserial_dense), +} + def generate_quantized_np(shape, bits, out_dtype): min_val = 0 max_val = 1 << bits return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar): - input_dtype = 'uint32' out_dtype = 'int16' - with tvm.target.create('llvm'): - A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') - B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') - C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits, out_dtype=out_dtype, - unipolar=unipolar) - s = topi.generic.schedule_bitserial_dense([C]) - - a_shape = get_const_tuple(A.shape) - b_shape = get_const_tuple(B.shape) - - @memoize("topi.tests.test_topi_bitseral_dense") - def get_ref_data(): + def get_ref_data(a_shape, b_shape, input_dtype): a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype) b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype) if unipolar: @@ -53,15 +48,30 @@ def get_ref_data(): else: c_np = np.dot(a_np, b_np.T) return a_np, b_np, c_np - a_np, b_np, c_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) - func = tvm.build(s, [A, B, C], "llvm") - func(a, b, c) - tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + for target in ["llvm", "llvm -device=arm_cpu"]: + if "arm_cpu" in target and 'arm' not in os.uname()[4]: + print ("Skipped running code, not an arm device") + continue + input_dtype = 'uint8' if "arm_cpu" in target else "uint32" + A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') + B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') + fcompute, fschedule = topi.testing.dispatch(target, _bitserial_dense_implement) + C = fcompute(A, B, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = fschedule([C]) + + a_shape = get_const_tuple(A.shape) + b_shape = get_const_tuple(B.shape) + a_np, b_np, c_np = get_ref_data(a_shape, b_shape, input_dtype) + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + func = tvm.build(s, [A, B, C], target) + func(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) def test_bitserial_dense(): verify_bitserial_dense(1, 1024, 1000, 1, 1, True) diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py index 13da6af9eb02..ce6a28643b58 100644 --- a/topi/tests/python/test_topi_bnn.py +++ b/topi/tests/python/test_topi_bnn.py @@ -33,9 +33,9 @@ def verify_binary_dense(batch, in_dim, out_dim): bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1) # schedule with tvm.target.create('llvm'): - s1 = topi.generic.schedule_binarize_pack(bnn_A) - s2 = topi.generic.schedule_binarize_pack(bnn_B) - s3 = topi.generic.schedule_binary_dense(bnn_C) + s1 = topi.x86.schedule_binarize_pack(bnn_A) + s2 = topi.x86.schedule_binarize_pack(bnn_B) + s3 = topi.x86.schedule_binary_dense(bnn_C) dtype = A.dtype @memoize("topi.tests.test_topi_binary_dense") From a7768bbafb43d956b0d82a958b5d5ec41376b54a Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 00:35:46 +0000 Subject: [PATCH 14/48] fix more test --- python/tvm/relay/backend/compile_engine.py | 1 + python/tvm/relay/op/strategy/x86.py | 2 +- python/tvm/relay/op/vision/_vision.py | 2 - python/tvm/te/schedule.py | 2 +- topi/python/topi/cuda/group_conv2d_nchw.py | 4 +- topi/python/topi/testing/common.py | 13 +- topi/python/topi/x86/conv2d.py | 1 + topi/python/topi/x86/depthwise_conv2d.py | 14 +- topi/tests/python/test_topi_conv2d_nhwc.py | 13 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 25 +-- .../python/test_topi_conv2d_transpose_nchw.py | 21 ++- .../tests/python/test_topi_conv2d_winograd.py | 104 +++++------ .../python/test_topi_depthwise_conv2d.py | 176 ++++++++++-------- topi/tests/python/test_topi_group_conv2d.py | 15 +- .../test_topi_group_conv2d_NCHWc_int8.py | 12 +- 15 files changed, 222 insertions(+), 183 deletions(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 5ea961ad0203..d9d8c8b89873 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -212,6 +212,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): @tvm._ffi.register_func("relay.backend.lower_call") def lower_call(call, inputs, target): + """Lower the call expression to op implementation and tensor outputs.""" assert isinstance(call.op, _op.Op) op = call.op diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 1cf05a0165d2..64901fa49149 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -106,7 +106,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): if layout == "NCHW": assert kernel_layout == "OIHW" channel_multiplier = get_const_tuple(inputs[1].shape)[1] - if channel_multiplier == 1: + if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: strategy.add_implement( wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 6b80aa7778f8..6e2008ad74c0 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -18,8 +18,6 @@ """Definition of vision ops""" from __future__ import absolute_import -import topi -from topi.util import get_const_int, get_const_float, get_float_tuple from .. import op as reg from .. import strategy from ..op import OpPattern diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py index 6a656a89776d..6499cb57d5c9 100644 --- a/python/tvm/te/schedule.py +++ b/python/tvm/te/schedule.py @@ -540,7 +540,7 @@ def __init__(self, conditions): _ffi_api._CreateSpecializedCondition, conditions) @staticmethod - def current(self): + def current(): """Returns the current specialized condition""" return _ffi_api._GetCurrentSpecialization() diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index ed243be5abce..357e87ac96ae 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -28,8 +28,8 @@ @autotvm.register_topi_compute("group_conv2d_nchw.cuda") -def group_conv2d_nchw_cuda(_, data, kernel, stride, padding, dilation, groups, - out_dtype='float32'): +def group_conv2d_nchw(_, data, kernel, stride, padding, dilation, groups, + out_dtype='float32'): return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype) diff --git a/topi/python/topi/testing/common.py b/topi/python/topi/testing/common.py index 4c926e991399..5817513f7f65 100644 --- a/topi/python/topi/testing/common.py +++ b/topi/python/topi/testing/common.py @@ -57,12 +57,17 @@ def get_reduce_schedule(target): _conv2d_nchw_implement = { "generic": (topi.nn.conv2d_nchw, topi.generic.schedule_conv2d_nchw), "cpu": (topi.x86.conv2d_nchw, topi.x86.schedule_conv2d_nchw), - "arm_cpu": (topi.arm_cpu.conv2d_nchw_spatial_pack, topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + "arm_cpu": (topi.arm_cpu.conv2d_nchw_spatial_pack, + topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), "gpu": (topi.cuda.conv2d_nchw, topi.cuda.schedule_conv2d_nchw), - "mali": (topi.mali.conv2d_nchw_spatial_pack, topi.mali.schedule_conv2d_nchw_spatial_pack), - "bifrost": (topi.bifrost.conv2d_nchw_spatial_pack, topi.bifrost.schedule_conv2d_nchw_spatial_pack), + "mali": (topi.mali.conv2d_nchw_spatial_pack, + topi.mali.schedule_conv2d_nchw_spatial_pack), + "bifrost": (topi.bifrost.conv2d_nchw_spatial_pack, + topi.bifrost.schedule_conv2d_nchw_spatial_pack), "opengl": (topi.nn.conv2d_nchw, topi.opengl.schedule_conv2d_nchw), - "intel_graphics": (topi.intel_graphics.conv2d_nchw, topi.intel_graphics.schedule_conv2d_nchw) + "intel_graphics": (topi.intel_graphics.conv2d_nchw, + topi.intel_graphics.schedule_conv2d_nchw), + "hls": (topi.nn.conv2d_nchw, topi.hls.schedule_conv2d_nchw) } def get_conv2d_nchw_implement(target): diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index b93665ee5577..60eb966af62f 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -76,6 +76,7 @@ def _conv2d_infer_layout(workload, cfg): def schedule_conv2d_nhwc(outs): """Create schedule for conv2d_nhwc""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op scheduled_ops = [] diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 488842e95d54..7f2673f40073 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -20,7 +20,6 @@ import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .. import tag from ..nn.pad import pad from ..util import get_const_tuple from ..nn.util import get_pad_tuple @@ -114,6 +113,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, in_channel = in_channel_chunk * in_channel_block out_channel = out_channel_chunk * out_channel_block channel_multiplier = cm_chunk * cm_block + assert channel_multiplier * in_channel == out_channel else: batch, in_channel, in_height, in_width = get_const_tuple(data.shape) out_channel, channel_multiplier, filter_height, filter_width = get_const_tuple(kernel.shape) @@ -134,10 +134,11 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) # get workload and related schedule config - wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), - tvm.placeholder((out_channel, channel_multiplier, filter_height, filter_width), - dtype=kernel.dtype), - strides, padding, out_dtype) + wkl = _get_workload( + tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), + tvm.placeholder((out_channel, channel_multiplier, filter_height, filter_width), + dtype=kernel.dtype), + strides, padding, out_dtype) if cfg.is_fallback: _fallback_schedule(cfg, wkl) @@ -181,6 +182,7 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, @autotvm.register_topi_schedule("depthwise_conv2d_NCHWc.x86") def schedule_depthwise_conv2d_NCHWc(cfg, outs): """CPU schedule for depthwise conv2d in NCHW[x]c layout""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -241,9 +243,7 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out elif out_ndim == 4: batch, oc, oh, ow = s[O].op.axis ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) - print(ow_chunk, ow_block) oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) - print(oc_chunk, oc_block) s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) parallel_axis = s[O].fuse(oc_chunk, oh) s[C].compute_at(s[O], parallel_axis) diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py index 342f3190b702..2a5915ef0a53 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc.py +++ b/topi/tests/python/test_topi_conv2d_nhwc.py @@ -24,6 +24,16 @@ from topi.util import get_const_tuple + +_conv2d_nhwc_implement = { + "generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc), + "cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc), + "arm_cpu": (topi.arm_cpu.conv2d_nhwc_spatial_pack, + topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + "hls": (topi.nn.conv2d_nhwc, topi.hls.schedule_conv2d_nhwc) +} + + def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size @@ -60,7 +70,8 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm', 'cuda']: + # TODO(@alexgl-github): add cuda back after fix conv2d_nhwc for cuda + for device in ['llvm']: check_device(device) diff --git a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py index 763150ac425f..8267aad382e8 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. """Example code to do convolution.""" -import os +import pytest import numpy as np + import tvm from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity @@ -56,7 +57,7 @@ def check_device(device): with tvm.target.create(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NHWC', out_dtype="int32") - s = topi.generic.schedule_conv2d_nhwc_pack([B]) + s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -69,22 +70,12 @@ def check_device(device): check_device(device) -class DefaultFallback(autotvm.FallbackContext): - def _query_inside(self, target, workload): - key = (target, workload) - if key in self.memory: - return self.memory[key] - cfg = FallbackConfigEntity() - cfg.template_key = 'direct' - self.memory[key] = cfg - return cfg - - +# TODO(@llyfacebook): Please fix https://github.com/apache/incubator-tvm/issues/4122 to enable this test. +@pytest.mark.skip def test_conv2d_nhwc(): - autotvm.DispatchContext.current.silent = True - with DefaultFallback(): - verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0) + verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0) if __name__ == "__main__": - test_conv2d_nhwc() + # test_conv2d_nhwc() + pass diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index fb836d43ccce..e8aabc61a4fa 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -24,6 +24,14 @@ from common import get_all_backend +_conv2d_transpose_nchw_implement = { + "generic": (topi.nn.conv2d_transpose_nchw, topi.generic.schedule_conv2d_transpose_nchw), + "cpu": (topi.x86.conv2d_transpose_nchw, topi.x86.schedule_conv2d_transpose_nchw), + "arm_cpu": (topi.arm_cpu.conv2d_transpose_nchw, topi.arm_cpu.schedule_conv2d_transpose_nchw), + "gpu": (topi.cuda.conv2d_transpose_nchw, topi.cuda.schedule_conv2d_transpose_nchw), + "hls": (topi.nn.conv2d_transpose_nchw, topi.hls.schedule_conv2d_transpose_nchw), +} + def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_height, in_width = in_size kernel_height, kernel_width = kernel @@ -54,13 +62,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - B = topi.nn.conv2d_transpose_nchw(A, W, - [stride_height, stride_width], - [pad_top, pad_left, pad_bottom, pad_right], - A.dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_transpose_nchw_implement) + B = fcompute(A, W, + [stride_height, stride_width], + [pad_top, pad_left, pad_bottom, pad_right], + A.dtype) C = topi.nn.relu(B) - s1 = topi.generic.schedule_conv2d_transpose_nchw([B]) - s2 = topi.generic.schedule_conv2d_transpose_nchw([C]) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py index 350e62101689..2d12336e771a 100644 --- a/topi/tests/python/test_topi_conv2d_winograd.py +++ b/topi/tests/python/test_topi_conv2d_winograd.py @@ -27,6 +27,13 @@ from topi.util import get_const_tuple +_conv2d_nchw_winograd_implement = { + "arm_cpu": (topi.arm_cpu.conv2d_nchw_winograd, topi.arm_cpu.schedule_conv2d_nchw_winograd), + "cuda": (topi.cuda.conv2d_nchw_winograd, topi.cuda.schedule_conv2d_nchw_winograd), + "mali": (topi.mali.conv2d_nchw_winograd, topi.mali.schedule_conv2d_nchw_winograd), +} + + def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) @@ -67,12 +74,13 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_nchw_winograd_implement) + C = fcompute(A, W, stride, padding, dilation, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -93,61 +101,45 @@ def check_device(device): check_device(device) -class WinogradFallback(autotvm.FallbackContext): - def _query_inside(self, target, workload): - key = (target, workload) - if key in self.memory: - return self.memory[key] - cfg = FallbackConfigEntity() - cfg.template_key = 'winograd' - self.memory[key] = cfg - cfg.is_fallback = False - return cfg - - def test_conv2d_nchw(): - autotvm.DispatchContext.current.silent = True - - with WinogradFallback(): - - # inception v3 workloads - verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) - verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=['cuda']) - verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=['cuda']) - - # resnet 18 workloads - verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) - verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) - verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) - verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=['cuda']) - - # batch size = 2 - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1) - - # relu, bias - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True) - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True) - - # werid workloads - verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1) - verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1) - verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1) - - # Asymmetric padding - verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(1, 64, 28, 64, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(1, 128, 14, 128, 3, 1, (1, 1)) - verify_conv2d_nchw(1, 512, 7, 512, 3, 1, "SAME") - verify_conv2d_nchw(2, 13, 71, 59, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1, 1, 1), add_bias=True) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True) - verify_conv2d_nchw(1, 64, 17, 192, 7, 1, (3, 1), devices=['cuda']) - verify_conv2d_nchw(1, 64, 17, 64, 7, 1, (3, 3, 2, 2), devices=['cuda']) - verify_conv2d_nchw(1, 160, 17, 160, 7, 1, "SAME", devices=['cuda']) - verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=['cuda']) + # inception v3 workloads + verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=['cuda']) + + # resnet 18 workloads + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=['cuda']) + + # batch size = 2 + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1) + + # relu, bias + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True) + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True) + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True) + + # weird workloads + verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1) + verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1) + verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1) + + # Asymmetric padding + verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(1, 64, 28, 64, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(1, 128, 14, 128, 3, 1, (1, 1)) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, "SAME") + verify_conv2d_nchw(2, 13, 71, 59, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1, 1, 1), add_bias=True) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True) + verify_conv2d_nchw(1, 64, 17, 192, 7, 1, (3, 1), devices=['cuda']) + verify_conv2d_nchw(1, 64, 17, 64, 7, 1, (3, 3, 2, 2), devices=['cuda']) + verify_conv2d_nchw(1, 160, 17, 160, 7, 1, "SAME", devices=['cuda']) + verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=['cuda']) if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index d34d56e7fc86..7efe5a21578c 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -25,6 +25,24 @@ from common import get_all_backend +_depthwise_conv2d_nchw_implement = { + "generic": [(topi.nn.depthwise_conv2d_nchw, topi.generic.schedule_depthwise_conv2d_nchw)], + "arm_cpu": [(topi.arm_cpu.depthwise_conv2d_nchw, topi.arm_cpu.schedule_depthwise_conv2d_nchw), + (topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack, + topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack)], + "gpu": [(topi.cuda.depthwise_conv2d_nchw, topi.cuda.schedule_depthwise_conv2d_nchw)], + "mali": [(topi.mali.depthwise_conv2d_nchw, topi.mali.schedule_depthwise_conv2d_nchw)], + "bifrost": [(topi.nn.depthwise_conv2d_nchw, topi.bifrost.schedule_depthwise_conv2d_nchw)], + "intel_graphics": [(topi.intel_graphics.depthwise_conv2d_nchw, + topi.intel_graphics.schedule_depthwise_conv2d_nchw)], +} + +_depthwise_conv2d_nhwc_implement = { + "generic": (topi.nn.depthwise_conv2d_nhwc, topi.generic.schedule_depthwise_conv2d_nhwc), + "gpu": (topi.nn.depthwise_conv2d_nhwc, topi.cuda.schedule_depthwise_conv2d_nhwc), +} + + def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1): in_width = in_height filter_channel = in_channel @@ -53,68 +71,75 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - with tvm.target.create(device): - # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, - (stride_h, stride_w), padding_args, dilation, dtype) - ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) - Relu = topi.nn.relu(ScaleShift) - # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift) - s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) - # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) - f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) - - # Prepare pod type for test data closure - input_shape = get_const_tuple(Input.shape) - filter_shape = get_const_tuple(Filter.shape) - scale_shape = get_const_tuple(Scale.shape) - shift_shape = get_const_tuple(Shift.shape) - scale_shift_shape = get_const_tuple(ScaleShift.shape) - - # Use memoize, pickle the test data for next time use. - @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") - def get_ref_data(): - input_np = np.random.uniform(size=input_shape).astype(dtype) - filter_np = np.random.uniform(size=filter_shape).astype(dtype) - dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)) - scale_np = np.random.uniform(size=scale_shape).astype(dtype) - shift_np = np.random.uniform(size=shift_shape).astype(dtype) - # correctness with scipy - depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( - input_np, dilated_filter_np, stride, padding) - scale_shift_scipy = np.zeros(shape=scale_shift_shape) - for c in range(in_channel * channel_multiplier): - scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] - relu_scipy = np.maximum(scale_shift_scipy, 0) - return (input_np, filter_np, scale_np, shift_np, - depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) - # Get the test data - (input_np, filter_np, scale_np, shift_np, - depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) - depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) - scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) - # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) - tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean - # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) - tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean - # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) - tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean - tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) - tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) - tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) + impl_list = topi.testing.dispatch(device, _depthwise_conv2d_nchw_implement)[:] + if device == "llvm" and channel_multiplier == 1 and dilation == 1: + impl_list.append((topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)) + + for fcompute, fschedule in impl_list: + with tvm.target.create(device): + # declare + DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w), + padding_args, dilation, dtype) + ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) + Relu = topi.nn.relu(ScaleShift) + # schedule + s1 = fschedule(DepthwiseConv2d) + s2 = fschedule(ScaleShift) + s3 = fschedule(Relu) + # build the kernels + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) + f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) + f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) + + # Prepare pod type for test data closure + input_shape = get_const_tuple(Input.shape) + filter_shape = get_const_tuple(Filter.shape) + scale_shape = get_const_tuple(Scale.shape) + shift_shape = get_const_tuple(Shift.shape) + scale_shift_shape = get_const_tuple(ScaleShift.shape) + + # Use memoize, pickle the test data for next time use. + @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") + def get_ref_data(): + input_np = np.random.uniform(size=input_shape).astype(dtype) + filter_np = np.random.uniform(size=filter_shape).astype(dtype) + dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)) + scale_np = np.random.uniform(size=scale_shape).astype(dtype) + shift_np = np.random.uniform(size=shift_shape).astype(dtype) + # correctness with scipy + depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( + input_np, dilated_filter_np, stride, padding) + scale_shift_scipy = np.zeros(shape=scale_shift_shape) + for c in range(in_channel * channel_multiplier): + scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] + relu_scipy = np.maximum(scale_shift_scipy, 0) + return (input_np, filter_np, scale_np, shift_np, + depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) + + # Get the test data + (input_np, filter_np, scale_np, shift_np, + depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() + + input_tvm = tvm.nd.array(input_np, ctx) + filter_tvm = tvm.nd.array(filter_np, ctx) + scale_tvm = tvm.nd.array(scale_np, ctx) + shift_tvm = tvm.nd.array(shift_np, ctx) + depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) + scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + # launch kernel 1 (depthwise_conv2d) + timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean + # launch kernel 2 (depthwise_conv2d + scale_shift) + timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean + # launch kernel 3 (depthwise_conv2d + scale_shift + relu) + timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean + tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) + tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) + tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) for device in get_all_backend(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters @@ -150,16 +175,17 @@ def check_device(device): return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _depthwise_conv2d_nhwc_implement) with tvm.target.create(device): # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, + DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift) - s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu) + s1 = fschedule(DepthwiseConv2d) + s2 = fschedule(ScaleShift) + s3 = fschedule(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) @@ -242,6 +268,7 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m stride_h = stride_w = stride assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation." + assert channel_multiplier == 1, "depthwise_conv2d_NCHWc currently does not support channel multiplier > 1." pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) padding_args = (pad_h, pad_w) @@ -276,17 +303,17 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, - (stride_h, stride_w), - padding_args, - (dilation, dilation), - in_layout, - out_layout, dtype) + DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc(Input, Filter, + (stride_h, stride_w), + padding_args, + (dilation, dilation), + in_layout, + out_layout, dtype) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) + s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d) + s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) @@ -319,7 +346,6 @@ def get_ref_data(): dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) - print(filter_tvm.shape) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) @@ -363,9 +389,7 @@ def test_depthwise_conv2d(): # NCHW[x]c depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") - depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME") depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") - depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID") if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py index 0e176780023d..3904db7d2b23 100644 --- a/topi/tests/python/test_topi_group_conv2d.py +++ b/topi/tests/python/test_topi_group_conv2d.py @@ -28,6 +28,12 @@ from common import get_all_backend, Int8Fallback +_group_conv2d_nchw_implement = { + "generic": (topi.nn.group_conv2d_nchw, topi.generic.schedule_group_conv2d_nchw), + "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw), +} + + def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, @@ -70,12 +76,13 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) + fcompute, fschedule = topi.testing.dispatch(device, _group_conv2d_nchw_implement) + C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_group_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -149,12 +156,12 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) + C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_group_conv2d_nchw([C]) + s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py index 3717534b85ff..08f136e5ae23 100644 --- a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py @@ -81,12 +81,12 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), - (dilation, dilation), - layout='NCHW%dc'%ic_block, - out_layout="NCHW%dc"%oc_block, - out_dtype=dtype) - s = topi.generic.schedule_conv2d_NCHWc([C]) + C = topi.x86.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), + (dilation, dilation), + 'NCHW%dc'%ic_block, + "NCHW%dc"%oc_block, + dtype) + s = topi.x86.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) From 3f33c70d7e8cef235fdc36ee3f540c5719dee3f6 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 11 Feb 2020 20:53:42 -0800 Subject: [PATCH 15/48] try to update vta using strategy --- vta/python/vta/top/bitpack.py | 5 +- vta/python/vta/top/op.py | 201 +++++++-------------- vta/python/vta/top/vta_conv2d.py | 16 +- vta/python/vta/top/vta_conv2d_transpose.py | 15 +- vta/python/vta/top/vta_dense.py | 12 +- vta/python/vta/top/vta_group_conv2d.py | 8 +- 6 files changed, 89 insertions(+), 168 deletions(-) diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index d4748faad6a7..6e9d57bc0001 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -22,9 +22,8 @@ import tvm from topi import util -from tvm.relay.op.op import register_compute, register_schedule +from tvm.relay.op.op import register_compute, register_injective_schedule from tvm.relay.op.op import register_pattern, OpPattern -from tvm.relay.op.op import schedule_injective def bitpack(data, bits, pack_type="int8", name="bitpack"): """Packs lowest dimension into format needed by VTA @@ -86,5 +85,5 @@ def compute_bitpack(attrs, inputs): bits = 8 // lanes return bitpack(inputs[0], bits, dtype) -register_schedule("bitpack", schedule_injective) +register_injective_schedule("bitpack") register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index bf6409cc9405..4905992cc06c 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -22,10 +22,14 @@ import topi from tvm.relay.op import op as reg -from tvm.relay.op.op import OpPattern -from tvm.relay.op.nn import _nn +from tvm.relay.op import strategy as _strategy +from tvm.relay.op.op import OpPattern, OpStrategy from .util import is_packed_layout +from .vta_conv2d import conv2d_packed, schedule_conv2d_packed +from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed +from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed +from .vta_dense import dense_packed, schedule_dense_packed from ..environment import get_env @@ -49,138 +53,69 @@ def compute_clip(attrs, inputs, output_type, target): return [x] -@reg.register_compute("nn.conv2d", level=15) -def compute_conv2d(attrs, inputs, output_type, target): - """ Compute definition of conv2d """ - padding = topi.util.get_const_tuple(attrs.padding) - strides = topi.util.get_const_tuple(attrs.strides) - dilation = tuple([int(d) for d in attrs.dilation]) +@_strategy.conv2d_strategy.register("vta") +def conv2d_strategy_vta(attrs, inputs, out_type, target): + """conv2d vta strategy""" + strategy = OpStrategy() + kernel = inputs[1] + dilation = topi.util.get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout - out_dtype = attrs.out_dtype - - if target.device_name == "vta": - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - if groups == 1: - assert groups == 1 - env = get_env() - assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" - inputs = list(inputs) - assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], - inputs[1], - strides, - padding, - dilation, - layout, - out_dtype)] - return [topi.nn.group_conv2d_nchw(inputs[0], - inputs[1], - strides, - padding, - dilation, - groups, - out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_conv2d(attrs, inputs, output_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_conv2d(attrs, inputs, output_type, target) - - -@reg.register_schedule("nn.conv2d", level=15) -def schedule_conv2d(attrs, outs, target): - """ Schedule definition of conv2d """ - groups = attrs.groups - layout = attrs.data_layout - - if target.device_name == "vta": - if is_packed_layout(layout): - target = tvm.target.create(target) - assert target.device_name == "vta" - if groups == 1: - return topi.generic.schedule_conv2d_nchw(outs) - return topi.generic.schedule_group_conv2d_nchw(outs) - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_conv2d(attrs, outs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_conv2d(attrs, outs, target) - - -@reg.register_compute("nn.conv2d_transpose", level=15) -def compute_conv2d_transpose(attrs, inputs, output_type, target): - """ 2D convolution algorithm. - """ - padding = topi.util.get_const_tuple(attrs.padding) - strides = topi.util.get_const_tuple(attrs.strides) - dilation = tuple([int(d) for d in attrs.dilation]) - layout = attrs.data_layout - out_dtype = attrs.out_dtype - if target.device_name == "vta": - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - return [topi.nn.conv2d_transpose_nchw( - inputs[0], inputs[1], strides, padding, out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_conv2d_transpose(attrs, inputs, output_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_conv2d_transpose(attrs, inputs, output_type, target) - - -@reg.register_schedule("nn.conv2d_transpose", level=15) -def schedule_conv2d_transpose(attrs, outputs, target): - """ 2D convolution schedule. - """ + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + if is_packed_layout(layout): + if groups == 1: + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" + assert kernel.dtype == "int8" + + strategy.add_implement( + _strategy.wrap_compute_conv2d(conv2d_packed, True), + _strategy.wrap_topi_schedule(schedule_conv2d_packed), + name="conv2d_packed.vta") + else: # group_conv2d + strategy.add_implement( + _strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True), + _strategy.wrap_topi_schedule(schedule_group_conv2d_packed), + name="group_conv2d_packed.vta") + return strategy + + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.arm_cpu.conv2d_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) + + +@_strategy.conv2d_transpose_strategy.register("vta") +def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target): + """conv2d_transpose vta strategy""" + dilation = topi.util.get_const_tuple(attrs.dilation) layout = attrs.data_layout - - if target.device_name == "vta": - if is_packed_layout(layout): - return topi.nn.schedule_conv2d_transpose_nchw(outputs) - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_conv2d_transpose(attrs, outputs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_conv2d_transpose(attrs, outputs, tvm.target.Target.current()) - - -@reg.register_compute("nn.dense", level=15) -def compute_dense(attrs, inputs, out_type, target): - """Compute definition of dense""" - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - if target.device_name == "vta": - if inputs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_dense(attrs, inputs, out_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_dense(attrs, inputs, out_type, target) - - -@reg.register_schedule("nn.dense", level=15) -def schedule_dense(attrs, outs, target): - """Schedule definition of dense""" - if target.device_name == "vta": - if outs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - assert target.device_name == "vta" - return topi.generic.schedule_dense(outs) - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_dense(attrs, outs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_dense(attrs, outs, target) + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + + if is_packed_layout(layout): + strategy = OpStrategy() + strategy.add_implement( + _strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed), + _strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed), + name="conv2d_transpose_packed.vta") + return strategy + + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.arm_cpu.conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) + + +@_strategy.dense_strategy.register("vta") +def dense_strategy_vta(attrs, inputs, out_type, target): + """dense vta strategy""" + if inputs[0].shape == 4: # this implies the layout is packed + strategy = OpStrategy() + strategy.add_implement( + _strategy.wrap_compute_dense(dense_packed), + _strategy.wrap_topi_schedule(schedule_dense_packed), + name="dense_packed.vta") + return strategy + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.x86.dense_strategy_cpu(attrs, inputs, out_type, arm_tgt) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index e15f6c1d9ecc..ba93b05ca232 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,15 +25,8 @@ from .util import is_packed_layout from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') -def _declaration_conv2d(cfg, - data, - kernel, - strides, - padding, - dilation, - layout, - out_dtype): +@autotvm.register_topi_compute("conv2d_packed.vta") +def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): """ Packed conv2d function.""" if not is_packed_layout(layout): raise topi.InvalidShapeError() @@ -69,8 +62,9 @@ def _declaration_conv2d(cfg, return res -@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') -def _schedule_conv2d(cfg, outs): +@autotvm.register_topi_schedule("conv2d_packed.vta") +def schedule_conv2d_packed(cfg, outs): + """Schedule packed conv2d""" assert len(outs) == 1 output = outs[0] const_ops = [] diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index a2750dc9081d..a3fd7ac92cd3 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -26,13 +26,9 @@ from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.conv2d_transpose_nchw, 'vta', 'direct') -def _declatation_conv2d_transpose(cfg, - data, - kernel, - strides, - padding, - out_dtype): +@autotvm.register_topi_compute("conv2d_transpose_packed.vta") +def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype): + """Packed conv2d_transpose compute""" ishape = get_const_tuple(data.shape) kshape = get_const_tuple(kernel.shape) b, c_i, i_h, i_w, t_b, t_ci = ishape @@ -75,8 +71,9 @@ def _declatation_conv2d_transpose(cfg, return out -@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw, 'vta', 'direct') -def _schedule_conv2d_transpose(cfg, outs): +@autotvm.register_topi_schedule("conv2d_transpose_packed.vta") +def schedule_conv2d_transpose_packed(cfg, outs): + """Schedule packed conv2d_transpose""" assert len(outs) == 1 output = outs[0] ewise_inputs = [] diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 9d6c19c5af20..e23910447ba8 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -32,12 +32,8 @@ def is_packed_layout(layout): return True return False -@autotvm.register_topi_compute(topi.nn.dense, 'vta', 'direct') -def _declaration_dense(cfg, - data, - weight, - bias=None, - out_dtype=None): +@autotvm.register_topi_compute("dense_packed.vta") +def dense_packed(cfg, data, weight, bias=None, out_dtype=None): """Dense function declaration.""" # Make sure that the dense operator is packed @@ -67,8 +63,8 @@ def _declaration_dense(cfg, return res -@autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') -def _schedule_dense(cfg, outs): +@autotvm.register_topi_schedule("dense_packed.vta") +def schedule_dense_packed(cfg, outs): """Packed dense schedule.""" assert len(outs) == 1 diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py index e54637f2c204..aa06c61c3ec0 100644 --- a/vta/python/vta/top/vta_group_conv2d.py +++ b/vta/python/vta/top/vta_group_conv2d.py @@ -24,8 +24,8 @@ from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.group_conv2d_nchw, 'vta', 'direct') -def packed_group_conv2d(cfg, +@autotvm.register_topi_compute("group_conv2d_packed.vta") +def group_conv2d_packed(cfg, data, kernel, strides, @@ -74,8 +74,8 @@ def packed_group_conv2d(cfg, return out -@autotvm.register_topi_schedule(topi.generic.schedule_group_conv2d_nchw, 'vta', 'direct') -def schedule_packed_group_conv2d(cfg, outs): +@autotvm.register_topi_schedule("group_conv2d_packed.vta") +def schedule_group_conv2d_packed(cfg, outs): """ Schedule the packed conv2d. """ assert len(outs) == 1 From 8ef29b56f18822553813f8b98fd0835ffaa8df00 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 11:18:18 -0800 Subject: [PATCH 16/48] fix cpptest --- src/relay/ir/op_attr_types.cc | 27 ++++++-------- tests/cpp/relay_build_module_test.cc | 56 +++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/src/relay/ir/op_attr_types.cc b/src/relay/ir/op_attr_types.cc index c39427b91372..51344d196a1e 100644 --- a/src/relay/ir/op_attr_types.cc +++ b/src/relay/ir/op_attr_types.cc @@ -46,7 +46,7 @@ void OpSpecialization::AddImplement(tvm::relay::FTVMCompute fcompute, auto n = make_object(); n->fcompute = fcompute; n->fschedule = fschedule; - n->name = name; + n->name = std::move(name); n->plevel = plevel; (*this)->implements.push_back(OpImplement(n)); } @@ -56,23 +56,20 @@ void OpStrategy::AddImplement(FTVMCompute fcompute, std::string name, int plevel) { auto curr_cond = te::SpecializedCondition::Current(); - auto specializations = (*this)->specializations; + auto self = this->operator->(); + Array specializations = self->specializations; OpSpecialization op_spec; - for (auto e : specializations) { - if (e->condition == curr_cond) { - op_spec = e; - break; + for (OpSpecialization op_spec : specializations) { + if (op_spec->condition == curr_cond) { + op_spec.AddImplement(fcompute, fschedule, std::move(name), plevel); + return; } } - if (op_spec.defined()) { - op_spec.AddImplement(fcompute, fschedule, name, plevel); - } else { - ObjectPtr n = make_object(); - n->condition = curr_cond; - op_spec = OpSpecialization(n); - op_spec.AddImplement(fcompute, fschedule, name, plevel); - (*this)->specializations.push_back(op_spec); - } + ObjectPtr n = make_object(); + n->condition = curr_cond; + op_spec = OpSpecialization(n); + op_spec.AddImplement(fcompute, fschedule, std::move(name), plevel); + self->specializations.push_back(op_spec); } TVM_REGISTER_GLOBAL("relay.op._OpImplementCompute") diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index c27408315016..e30209948f67 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -24,18 +24,55 @@ #include #include #include +#include +#include #include #include #include #include -TVM_REGISTER_GLOBAL("test.sch") -.set_body([](tvm::TVMArgs args, tvm::TVMRetValue *rv) { - *rv = topi::generic::schedule_injective(args[0], args[1]); - }); +using namespace tvm; +using namespace tvm::relay; + +TVM_REGISTER_GLOBAL("test.strategy") +.set_body_typed([](const Attrs& attrs, const Array& inputs, + const Type& out_type, const Target& target) { + FTVMCompute fcompute = [](const Attrs& attrs, + const Array& inputs, + const Type& out_type) -> Array { + CHECK_EQ(inputs.size(), 2U); + return {topi::add(inputs[0], inputs[1])}; + }; + FTVMSchedule fschedule = [](const Attrs& attrs, + const Array& outs, + const Target& target) { + With target_scope(target); + return topi::generic::schedule_injective(target, outs); + }; + + auto n = make_object(); + auto strategy = tvm::relay::OpStrategy(std::move(n)); + strategy.AddImplement(fcompute, fschedule, "test.strategy", 10); + return strategy; +}); + +TVM_REGISTER_GLOBAL("relay.backend.lower_call") +.set_body_typed([](const relay::Call& call, const Array& inputs, + const Target& target) { + static auto fstrategy = Op::GetAttr("FTVMStrategy"); + Op op = Downcast(call->op); + auto out_type = call->checked_type(); + OpStrategy strategy = fstrategy[op](call->attrs, inputs, out_type, target); + auto impl = strategy->specializations[0]->implements[0]; + auto outs = impl.Compute(call->attrs, inputs, out_type); + auto f = tvm::runtime::Registry::Get("relay.backend._make_LoweredOutput"); + if (!f) { + LOG(FATAL) << "relay.backend._make_LoweredOutput is not registered"; + } + return (*f)(outs, impl); +}); TEST(Relay, BuildModule) { - using namespace tvm; auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32)); auto a = relay::VarNode::make("a", tensor_type); auto b = relay::VarNode::make("b", tensor_type); @@ -59,14 +96,15 @@ TEST(Relay, BuildModule) { } // get schedule auto reg = tvm::runtime::Registry::Get("relay.op._Register"); - auto s_i = tvm::runtime::Registry::Get("test.sch"); if (!reg) { LOG(FATAL) << "no _Register"; } - if (!s_i) { - LOG(FATAL) << "no _Register"; + auto fs = tvm::runtime::Registry::Get("test.strategy"); + if (!fs) { + LOG(FATAL) << "No test_strategy registered."; } - (*reg)("add", "FTVMSchedule", *s_i, 10); + auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs); + (*reg)("add", "FTVMStrategy", fgeneric, 10); // build auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); From 220c45564f5e3601cb0d53008e684a58733f04b8 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 11:37:29 -0800 Subject: [PATCH 17/48] x --- include/tvm/relay/op_attr_types.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 6c6b0e6b6262..5f02f42bac1c 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -259,9 +259,9 @@ class OpImplement : public ObjectRef { * \param out_type The output type information. * \return The output compute description of the operator. */ - Array Compute(const Attrs& attrs, - const Array& inputs, - const Type& out_type); + TVM_DLL Array Compute(const Attrs& attrs, + const Array& inputs, + const Type& out_type); /*! * \brief Build the computation schedule. * \param attrs The attribute of the node. @@ -269,9 +269,9 @@ class OpImplement : public ObjectRef { * \param target The build target. * \return The computation schedule. */ - te::Schedule Schedule(const Attrs& attrs, - const Array& outs, - const Target& target); + TVM_DLL te::Schedule Schedule(const Attrs& attrs, + const Array& outs, + const Target& target); TVM_DEFINE_OBJECT_REF_METHODS(OpImplement, ObjectRef, OpImplementNode); }; @@ -308,8 +308,8 @@ class OpSpecialization : public ObjectRef { * \param name Name of the implementation * \param plevel Priority level of the implementation */ - void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - std::string name, int plevel); + TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); }; @@ -342,8 +342,8 @@ class OpStrategy : public ObjectRef { * \param name Name of the implementation * \param plevel Priority level of the implementation */ - void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, std::string name, - int plevel); + TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); }; From 2b291972ac0e57d022ddfb847a20bed297afc9a1 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 13:10:09 -0800 Subject: [PATCH 18/48] fix rebase err --- python/tvm/relay/backend/compile_engine.py | 4 ++-- python/tvm/relay/op/op.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index d9d8c8b89873..94415939b67f 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -231,9 +231,9 @@ def lower_call(call, inputs, target): new_fields.append(field) ret_type = _ty.TupleType(new_fields) - is_dyn = call.checked_type.is_dynamic() + is_dyn = _ty.type_has_any(call.checked_type) for arg in call.args: - is_dyn = is_dyn or arg.checked_type.is_dynamic() + is_dyn = is_dyn or _ty.type_has_any(arg.checked_type) # check if in the AutoTVM tracing mode, and disable if op is not in wanted list env = autotvm.task.TaskExtractEnv.current diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 5e2426ba3407..da25cd1f033a 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -23,6 +23,7 @@ from ..expr import RelayExpr from ...api import register_func from ...target import get_native_generic_func, GenericFunc +from ...runtime import Object from . import _make @register_relay_node @@ -143,8 +144,8 @@ class OpPattern(object): OPAQUE = 8 -@register_relay_node -class OpImplement(Expr): +@tvm._ffi.register_object("relay.OpImplement") +class OpImplement(Object): """Operator implementation""" def compute(self, attrs, inputs, out_type): """Call compute function. @@ -189,13 +190,13 @@ def schedule(self, attrs, outs, target): return _OpImplementSchedule(self, attrs, outs, target) -@register_relay_node -class OpSpecialization(Expr): +@tvm._ffi.register_object("relay.OpSpecialization") +class OpSpecialization(Object): """Operator specialization""" -@register_relay_node -class OpStrategy(Expr): +@tvm._ffi.register_object("relay.OpStrategy") +class OpStrategy(Object): """Operator strategy""" def __init__(self): self.__init_handle_by_constructor__(_make.OpStrategy) From ea85e73aa78fe0067764e06cf43169ce7c6abb38 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 12 Feb 2020 14:53:52 -0800 Subject: [PATCH 19/48] Fix two tests (#11) --- topi/tests/python/test_topi_conv2d_hwcn.py | 15 +++++++++++---- topi/tests/python/test_topi_conv2d_int8.py | 6 +++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py index 35423a686e8f..086523e46013 100644 --- a/topi/tests/python/test_topi_conv2d_hwcn.py +++ b/topi/tests/python/test_topi_conv2d_hwcn.py @@ -24,6 +24,12 @@ from topi.util import get_const_tuple +_conv2d_hwcn_implement = { + "generic": (topi.nn.conv2d_hwcn, topi.generic.schedule_conv2d_hwcn), + "gpu": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn), + "opencl": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn), +} + def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size @@ -56,12 +62,13 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - t_conv = topi.nn.conv2d(A, W, stride, padding, dilation, layout='HWCN') + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_hwcn_implement) + t_conv = fcompute(A, W, stride, padding, dilation) t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) - s1 = topi.generic.schedule_conv2d_hwcn([t_conv]) - s2 = topi.generic.schedule_conv2d_hwcn([t_bias]) - s3 = topi.generic.schedule_conv2d_hwcn([t_relu]) + s1 = fschedule([t_conv]) + s2 = fschedule([t_bias]) + s3 = fschedule([t_relu]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index 6cb66d013541..c36bfa331faf 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -82,13 +82,13 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d(A, W, (stride, stride), padding, (dilation, dilation), - layout='NCHW', out_dtype=dtype) + C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding, (dilation, dilation), + 'NCHW', dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) From ca702e1300bcdc43af125f6148773b04a9add79b Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 15:49:46 -0800 Subject: [PATCH 20/48] change autotvm log format --- python/tvm/autotvm/database.py | 3 +- python/tvm/autotvm/feature.py | 5 +- .../autotvm/graph_tuner/base_graph_tuner.py | 13 +- python/tvm/autotvm/record.py | 49 ++- python/tvm/autotvm/task/relay_integration.py | 2 +- python/tvm/autotvm/task/space.py | 12 +- python/tvm/autotvm/task/task.py | 79 ++-- .../python/unittest/test_graph_tuner_core.py | 386 ++++++++---------- .../python/unittest/test_graph_tuner_utils.py | 10 - .../topi/intel_graphics/conv2d_alter_op.py | 4 +- .../topi/intel_graphics/depthwise_conv2d.py | 4 +- topi/python/topi/x86/conv2d.py | 4 +- topi/python/topi/x86/depthwise_conv2d.py | 4 +- tutorials/autotvm/tune_relay_x86.py | 2 +- 14 files changed, 257 insertions(+), 320 deletions(-) diff --git a/python/tvm/autotvm/database.py b/python/tvm/autotvm/database.py index 55d4180f03be..75e3f9ff7d06 100644 --- a/python/tvm/autotvm/database.py +++ b/python/tvm/autotvm/database.py @@ -125,7 +125,7 @@ def load(self, inp, get_all=False): current = self.get(measure_str_key(inp)) if current is not None: records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)] - results = [rec[1] for rec in records] + results = [rec[1] for rec in records if rec is not None] if get_all: return results return max(results, key=lambda result: result.timestamp) @@ -167,6 +167,7 @@ def filter(self, func): current = self.get(key) try: records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)] + records = list(filter(None, records)) except TypeError: # got a badly formatted/old format record continue diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py index b7d1c44117a7..4ff1139d85f1 100644 --- a/python/tvm/autotvm/feature.py +++ b/python/tvm/autotvm/feature.py @@ -153,7 +153,10 @@ def get_flatten_name(fea): from .record import decode # flatten line to feature line = fea - inp, _ = decode(line) + ret = decode(line) + if ret is None: + raise ValueError("Unsupported AutoTVM log format") + inp, _ = ret target = _target.create(inp.target) with target: s, args = inp.template.instantiate(inp.config) diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index c64049333fc0..27dbeedd1e3d 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -177,7 +177,7 @@ def __init__(self, graph, input_shapes, records, target_ops, dtype = first_tensor[-1] new_shape = tuple([val.value for val in node_entry["types"][0].shape]) actual_workload = (input_workload[0],) + \ - ((new_shape + (dtype,)),) + input_workload[2:] + (("TENSOR", new_shape, dtype),) + input_workload[2:] node_entry["workloads"].append(actual_workload) if "record_candidates" not in node_entry: node_entry["record_candidates"] = input_node["record_candidates"] @@ -312,9 +312,8 @@ def _create_matrix_callback(self, from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args): """Create dictionary containing matrix format of layout transformation between nodes.""" - sargs = serialize_args(args) in_layout, out_layout = args[1], args[2] - ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(sargs) + ltf_workload = autotvm.task.args_to_workload(args, 'layout_transform') idx_pair_key = (from_node_idx, to_node_idx) if in_layout == out_layout: @@ -447,9 +446,8 @@ def _callback(_, inputs, results): measure_option = autotvm.measure_option(builder=builder, runner=runner) for args in args_list: data, in_layout, out_layout = args - args = serialize_args(args) - ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(args) - if ltf_workload in self._layout_transform_perf_records: + ltf_workload = autotvm.task.args_to_workload(args, 'layout_transform') + if ltf_workload in self._layout_transform_perf_records: continue if infer_layout: @@ -476,9 +474,8 @@ def _callback(_, inputs, results): continue records = [] - task = autotvm.task.create(layout_transform, args=args, target=self._target, + task = autotvm.task.create("layout_transform", args=args, target=self._target, target_host=target_host) - task.workload = ltf_workload tuner = autotvm.tuner.GridSearchTuner(task) tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)]) diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 5e8ac9d0c5df..a51a290778d5 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -31,12 +31,12 @@ import numpy as np from .. import build, lower, target as _target - +from .. import __version__ from . import task from .task import ConfigEntity, ApplyHistoryBest from .measure import MeasureInput, MeasureResult -AUTOTVM_LOG_VERSION = 0.1 +AUTOTVM_LOG_VERSION = 0.2 logger = logging.getLogger('autotvm') try: # convert unicode to str for python2 @@ -89,27 +89,29 @@ def encode(inp, result, protocol='json'): if protocol == 'json': json_dict = { - "i": (str(inp.target), - inp.task.name, inp.task.args, inp.task.kwargs, - inp.task.workload, - inp.config.to_json_dict()), + "input": (str(inp.target), + inp.task.name, inp.task.args, inp.task.kwargs, + inp.config.to_json_dict()), + + "result": (result.costs if result.error_no == 0 else (1e9,), + result.error_no, + result.all_cost, + result.timestamp), - "r": (result.costs if result.error_no == 0 else (1e9,), - result.error_no, - result.all_cost, - result.timestamp), + "version": AUTOTVM_LOG_VERSION, - "v": AUTOTVM_LOG_VERSION + "tvm_version": __version__ } return json.dumps(json_dict) if protocol == 'pickle': row = (str(inp.target), str(base64.b64encode(pickle.dumps([inp.task.name, inp.task.args, - inp.task.kwargs, - inp.task.workload])).decode()), + inp.task.kwargs])).decode()), str(base64.b64encode(pickle.dumps(inp.config)).decode()), - str(base64.b64encode(pickle.dumps(tuple(result))).decode())) + str(base64.b64encode(pickle.dumps(tuple(result))).decode()), + str(AUTOTVM_LOG_VERSION), + str(__version__)) return '\t'.join(row) raise RuntimeError("Invalid log protocol: " + protocol) @@ -133,7 +135,11 @@ def decode(row, protocol='json'): # pylint: disable=unused-variable if protocol == 'json': row = json.loads(row) - tgt, task_name, task_args, task_kwargs, workload, config = row['i'] + if 'v' in row and row['v'] == 0.1: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + return None + + tgt, task_name, task_args, task_kwargs, config = row['input'] tgt = _target.create(str(tgt)) def clean_json_to_python(x): @@ -149,15 +155,17 @@ def clean_json_to_python(x): return x tsk = task.Task(clean_json_to_python(task_name), clean_json_to_python(task_args)) - tsk.workload = clean_json_to_python(workload) config = ConfigEntity.from_json_dict(config) inp = MeasureInput(tgt, tsk, config) - result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]]) + result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["result"]]) config.cost = np.mean(result.costs) return inp, result if protocol == 'pickle': items = row.split("\t") + if len(items) == 4: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + return None tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) config = pickle.loads(base64.b64decode(items[2].encode())) @@ -165,7 +173,6 @@ def clean_json_to_python(x): config.cost = np.mean(result.costs) tsk = task.Task(task_tuple[0], task_tuple[1]) - tsk.workload = task_tuple[3] return MeasureInput(tgt, tsk, config), result raise RuntimeError("Invalid log protocol: " + protocol) @@ -186,7 +193,10 @@ def load_from_file(filename): """ for row in open(filename): if row and not row.startswith('#'): - inp, res = decode(row) + ret = decode(row) + if ret is None: + continue + inp, res = ret # Avoid loading the record with an empty config. The TOPI schedule with no entities # will result in an empty entity map (e.g., depthwise_conv2d_nchw on x86). # Using an empty config will cause problems when applying alter op like NCHW to NCHWc. @@ -212,6 +222,7 @@ def split_workload(in_file, clean=True): logger.info("start converting...") pool = multiprocessing.Pool() lines = pool.map(decode, lines) + lines = list(filter(None, lines)) logger.info("map done %.2f", time.time() - tic) wkl_dict = OrderedDict() diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index fda646c053f5..8a45e3d1240d 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -23,7 +23,7 @@ import threading import logging - +import tvm from .task import create from .topi_integration import TaskExtractEnv diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index d1810d4a75a2..47c227073677 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -893,8 +893,8 @@ def to_json_dict(self): a json serializable dictionary """ ret = {} - ret['i'] = int(self.index) - ret['c'] = self.code_hash + ret['index'] = int(self.index) + ret['code_hash'] = self.code_hash entity_map = [] for k, v in self._entity_map.items(): if isinstance(v, SplitEntity): @@ -907,7 +907,7 @@ def to_json_dict(self): entity_map.append((k, 'ot', v.val)) else: raise RuntimeError("Invalid entity instance: " + v) - ret['e'] = entity_map + ret['entity'] = entity_map return ret @staticmethod @@ -926,12 +926,12 @@ def from_json_dict(json_dict): The corresponding config object """ - index = json_dict["i"] - code_hash = json_dict["c"] + index = json_dict["index"] + code_hash = json_dict["code_hash"] constraints = [] entity_map = OrderedDict() - for item in json_dict["e"]: + for item in json_dict["entity"]: key, knob_type, knob_args = item if knob_type == 'sp': entity = SplitEntity(knob_args) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 3bbbffa0f655..a084b8c77c85 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -43,12 +43,22 @@ def serialize_args(args): ---------- args: list of hashable or Tensor """ + def _encode(x): + if isinstance(x, tensor.Tensor): + return ('TENSOR', get_const_tuple(x.shape), x.dtype) + if isinstance(x, (tuple, list, container.Array)): + return tuple([_encode(a) for a in x]) + if isinstance(x, (str, int, float, np.int, np.float, expr.Var)): + return x + if isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)): + return x.value + if x is None: + return 0 + raise RuntimeError('Do not support type "%s" in argument. Consider to use' + 'primitive types or tvm.expr.Var only' % type(x)) ret = [] for t in args: - if isinstance(t, tensor.Tensor): - ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype)) - else: - ret.append(t) + ret.append(_encode(t)) return tuple(ret) @@ -68,6 +78,27 @@ def deserialize_args(args): return ret +def args_to_workload(args, task_name=None): + """Convert argument list to hashable workload tuple. + This function will convert list to tuple, tvm node to python value and + flatten tvm.tensor.Tensor to a tuple + + Parameters + ---------- + task_name : str + The AutoTVM task name + + args : list of args + The arguments to the function + + Returns + ------- + ret: hashable + The hashable value + """ + return (task_name,) + serialize_args(args) if task_name is not None else serialize_args(args) + + class Task(object): """A Tunable Task @@ -88,11 +119,14 @@ def __init__(self, name, args): self.func = TASK_TABLE.get(name, _raise_error) # auxiliary info, available after `init_space` is called - self.workload = None self.flop = None self.target = None self.target_host = None + @property + def workload(self): + return (self.name,) + serialize_args(self.args) + def instantiate(self, config): """Instantiate this task function (template) with a config. Returns corresponding schedule. @@ -127,7 +161,6 @@ def __getstate__(self): "args": self.args, "kwargs": self.kwargs, "config_space": self.config_space, - "workload": self.workload, "flop": self.flop, "target": self.target, "target_host": self.target_host @@ -139,7 +172,6 @@ def __setstate__(self, state): self.kwargs = state["kwargs"] self.config_space = state["config_space"] self.func = TASK_TABLE.get(state["name"], _raise_error) - self.workload = state["workload"] self.flop = state["flop"] self.target = state["target"] self.target_host = state["target_host"] @@ -303,45 +335,12 @@ def create(task_name, args, target, target_host=None): sch, _ = ret.func(*args) ret.config_space.code_hash = getattr(sch, 'code_hash', None) - ret.workload = ctx.workload ret.flop = ret.config_space.flop or compute_flop(sch) ret.target = target ret.target_host = target_host return ret -def args_to_workload(x, task_name=None): - """Convert argument list to hashable workload tuple. - This function will convert list to tuple, tvm node to python value and - flatten tvm.tensor.Tensor to a tuple - - Parameters - ---------- - x: primitive hashable types or tensor.Tensor - The original value - task_name: str - The AutoTVM task name - - Returns - ------- - ret: hashable - The hashable value - """ - if isinstance(x, tensor.Tensor): - workload = get_const_tuple(x.shape) + (x.dtype, ) - elif isinstance(x, (tuple, list, container.Array)): - workload = tuple([args_to_workload(a) for a in x]) - elif isinstance(x, (str, int, float, np.int, np.float, expr.Var)): - workload = x - elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)): - workload = x.value - elif x is None: - workload = 0 - else: - raise RuntimeError('Do not support type "%s" in argument. Consider to use' - 'primitive types or tvm.expr.Var only' % type(x)) - return tuple((task_name, ) + workload) if task_name else workload - def get_config(): """Get current config object diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index 1070cc73266e..27e077f5319c 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -31,7 +31,6 @@ from tvm.autotvm.task import ConfigEntity from tvm.autotvm.measure import MeasureResult, MeasureInput from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner -from test_graph_tuner_utils import create_workload def _create_data(target, dshape, dtype, layout): @@ -49,67 +48,52 @@ def _create_data(target, dshape, dtype, layout): target=target, params=params, ops=(relay.op.get("nn.conv2d"),)) - wkl_list = [ - create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] costs = [0.04, 0.012, 0.03] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) ltf_keys = [] ltf_arg = [tvm.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) ltf_arg = [tvm.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) ltf_arg = [tvm.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) return net, records, ltf_records, ltf_keys, tasks @@ -165,29 +149,26 @@ def test_DPTuner_run(): mod["main"] = g costs = [0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) @@ -215,29 +196,26 @@ def test_PBQPTuner_run(): g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout) costs = [0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) @@ -282,78 +260,62 @@ def test_many_sub_graphs(): target=target, params=params, ops=(conv2d,)) - wkl_list = [ - create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) @@ -396,57 +358,47 @@ def test_tuple(): target=target, params=params, ops=(conv2d,)) - wkl_list = [ - create_workload((1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] costs = [0.01, 0.012, 0.03, 0.04] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [1, 2]], - ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]]} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [1, 2]], + ["tile_ow", "sp", [4, 8]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [1, 3]], - ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]]} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [1, 3]], + ["tile_ow", "sp", [2, 16]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [2, 1]], - ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]]} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [2, 1]], + ["tile_ow", "sp", [4, 8]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [3, 1]], - ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]]} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [3, 1]], + ["tile_ow", "sp", [2, 16]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) @@ -491,78 +443,62 @@ def test_triangle_block(): target=target, params=params, ops=(conv2d,)) - wkl_list = [ - create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 3, 8, 8), (32, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py index b4ea2d528507..112c5b8a7059 100644 --- a/tests/python/unittest/test_graph_tuner_utils.py +++ b/tests/python/unittest/test_graph_tuner_utils.py @@ -27,16 +27,6 @@ from tvm.autotvm.graph_tuner.utils import has_multiple_inputs, get_direct_ancestor, get_in_nodes, \ get_out_nodes, expr2graph, bind_inputs from tvm.relay.expr import Call, TupleGetItem, Tuple, Var -from topi.nn.conv2d import conv2d - - -def create_workload(dshape, kshape, strides, - padding, dilation, layout, - out_layout, dtype, out_dtype): - data = tvm.placeholder(dshape, dtype=dtype) - kernel = tvm.placeholder(kshape, dtype=dtype) - return autotvm.task.args_to_workload([data, kernel, strides, padding, dilation, layout, - out_layout, out_dtype], "conv2d_NCHWc.x86") def verify_has_multiple_inputs(node_list, node_idx, input_names, expected_result): diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py index 4e0314543843..6c0cca070f1d 100644 --- a/topi/python/topi/intel_graphics/conv2d_alter_op.py +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -90,8 +90,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): @conv2d_infer_layout.register("intel_graphics") def _conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, layout, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + out_channel, _, k_height, k_width = kernel[1] out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py index 92ce6fcac16b..17f19435b62f 100644 --- a/topi/python/topi/intel_graphics/depthwise_conv2d.py +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -332,8 +332,8 @@ def _depthwise_conv2d_infer_layout(workload, _): Input shapes and layouts, and output shapes and layouts """ _, data, kernel, strides, padding, _, _ = workload - batch_size, in_channel, in_height, in_width = data[:-1] - filter_channel, channel_multiplier, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + filter_channel, channel_multiplier, k_height, k_width = kernel[1] out_channel = filter_channel * channel_multiplier out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 60eb966af62f..d24ceb5de3f2 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -60,8 +60,8 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth @conv2d_infer_layout.register("cpu") def _conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, layout, _, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + out_channel, _, k_height, k_width = kernel[1] idxdiv = tvm.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 7f2673f40073..2aa5e748e5c7 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -257,8 +257,8 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out @depthwise_conv2d_infer_layout.register("cpu") def _depthwise_conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - filter_channel, channel_multiplier, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + filter_channel, channel_multiplier, k_height, k_width = kernel[1] out_channel = filter_channel * channel_multiplier out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index e1106d62921d..87d07f9870b2 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -160,7 +160,7 @@ def tune_kernels(tasks, # Use graph tuner to achieve graph level optimal schedules # Set use_DP=False if it takes too long to finish. def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True): - target_op = [relay.nn.conv2d] + target_op = [relay.op.get("nn.conv2d"),] Tuner = DPTuner if use_DP else PBQPTuner executor = Tuner(graph, {input_name: dshape}, records, target_op, target) executor.benchmark_layout_transform(min_exec_num=2000) From ea9720d2dada6affeebfeef334615cd1bf5c2442 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 15:56:11 -0800 Subject: [PATCH 21/48] lint --- python/tvm/autotvm/graph_tuner/base_graph_tuner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 27dbeedd1e3d..3e85e938fa82 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -25,7 +25,6 @@ import tvm from tvm import autotvm, relay from tvm.autotvm.task import get_config -from tvm.autotvm.task.topi_integration import serialize_args from tvm.autotvm.record import encode, load_from_file from tvm.autotvm.measure import MeasureResult, MeasureInput From 113a1fa18fe1283f2b60804d8cffe17c2672dc4e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 16:14:52 -0800 Subject: [PATCH 22/48] minor fix --- python/tvm/autotvm/task/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index a084b8c77c85..876936579a78 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -53,7 +53,7 @@ def _encode(x): if isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)): return x.value if x is None: - return 0 + return None raise RuntimeError('Do not support type "%s" in argument. Consider to use' 'primitive types or tvm.expr.Var only' % type(x)) ret = [] From b1804560576f618fca49f38d053ce72c7ff888c2 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 17:09:39 -0800 Subject: [PATCH 23/48] try fix vta test --- .../integration/test_benchmark_topi_conv2d.py | 21 ++++++++++++++----- .../test_benchmark_topi_conv2d_transpose.py | 16 +++++++++----- .../integration/test_benchmark_topi_dense.py | 10 ++++++--- .../test_benchmark_topi_group_conv2d.py | 12 ++++++++--- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index af71561972a1..1058eef29d22 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -20,6 +20,7 @@ import json import os +import pytest import numpy as np from collections import namedtuple @@ -79,9 +80,13 @@ def run_conv2d(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack + conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + conv2d_fcompute = vta.top.vta_conv2d.conv2d_packed + conv2d_fschedule = vta.top.vta_conv2d.schedule_conv2d_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) @@ -104,15 +109,20 @@ def run_conv2d(env, remote, wl, target, # Define base computation schedule with target: - res = topi.nn.conv2d( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), - layout, env.acc_dtype) + if data_pack: + res = conv2d_fcompute( + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + layout, env.acc_dtype) + else: + res = conv2d_fcompute( + data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_conv2d_nchw([res]) + s = conv2d_fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) @@ -222,7 +232,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index d729fa517692..fa372946feca 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -20,6 +20,7 @@ import json import os +import pytest import numpy as np from collections import namedtuple @@ -80,14 +81,18 @@ def run_conv2d_transpose(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + fcompute = topi.arm_cpu.conv2d_transpose_nchw + fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + fcompute = vta.top.vta_conv2d_transpose.conv2d_transpose_packed + fschedule = vta.top.vta_conv2d_transpose.schedule_conv2d_transpose_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel) if data_pack: data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) @@ -101,13 +106,13 @@ def run_conv2d_transpose(env, remote, wl, target, # Define base computation schedule with target: - res = topi.nn.conv2d_transpose_nchw( + res = fcompute( data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), env.acc_dtype) res = topi.right_shift(res, env.WGT_WIDTH) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_conv2d_transpose_nchw([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) @@ -210,7 +215,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d_transpose(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d_transpose(device): def _run(env, remote): if device == "vta": target = env.target @@ -227,5 +233,5 @@ def _run(env, remote): vta.testing.run(_run) if __name__ == "__main__": - # test_conv2d_transpose(device="arm_cpu") + test_conv2d_transpose(device="arm_cpu") test_conv2d_transpose(device="vta") diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index b0ee2f5f7792..52bdf3698e00 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -63,21 +63,25 @@ def run_gemm(env, remote, target, env.BATCH, env.BLOCK_IN) kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) + fcompute = vta.top.vta_dense.dense_packed + fschedule = vta.top.vta_dense.schedule_dense_packed else: data_shape = a_shape kernel_shape = w_shape + fcompute = topi.x86.dense_nopack + fschedule = topi.x86.schedule_dense_nopack data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: - res = topi.nn.dense( - data, kernel, out_dtype=env.acc_dtype) + res = fcompute( + data, kernel, None, env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_dense([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 7bba2449cea5..ff883c38b868 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -20,6 +20,7 @@ import json import os +import pytest import numpy as np from collections import namedtuple @@ -75,9 +76,13 @@ def run_group_conv2d(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + fcompute = topi.nn.group_conv2d_nchw + fschedule = topi.generic.schedule_group_conv2d_nchw elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + fcompute = vta.top.vta_group_conv2d.group_conv2d_packed + fschedule = vta.top.vta_group_conv2d.schedule_group_conv2d_packed # Derive shapes depending upon packing CI_G = wl.in_filter // wl.groups @@ -100,7 +105,7 @@ def run_group_conv2d(env, remote, wl, target, bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) # Define base computation schedule with target: - res = topi.nn.group_conv2d_nchw( + res = fcompute( data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), wl.groups, env.acc_dtype) res = topi.right_shift(res, 8) @@ -108,7 +113,7 @@ def run_group_conv2d(env, remote, wl, target, res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_group_conv2d_nchw([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) @@ -219,7 +224,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target From e3e7e72062f4479185ae30a6e38b9206208d92f4 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 12 Feb 2020 17:51:45 -0800 Subject: [PATCH 24/48] fix rebase err --- python/tvm/autotvm/task/task.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 4 ++-- python/tvm/target/generic_func.py | 2 ++ topi/python/topi/arm_cpu/conv2d_alter_op.py | 2 +- topi/python/topi/bifrost/conv2d.py | 2 +- topi/python/topi/cuda/conv2d_alter_op.py | 2 +- topi/python/topi/cuda/dense.py | 16 +--------------- topi/python/topi/cuda/group_conv2d_nchw.py | 2 +- .../topi/intel_graphics/conv2d_alter_op.py | 2 +- topi/python/topi/mali/conv2d.py | 2 +- topi/python/topi/x86/conv2d_alter_op.py | 2 +- topi/python/topi/x86/conv2d_int8.py | 2 +- 12 files changed, 14 insertions(+), 26 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 876936579a78..83ace71fac31 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -349,7 +349,7 @@ def get_config(): cfg: ConfigSpace or ConfigEntity The current config """ - tgt = _target.current_target(allow_none=True) + tgt = _target.Target.current(allow_none=True) return DispatchContext.current.query(tgt, None) class FlopCalculationError(RuntimeError): diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 29796df14271..2f08864bc771 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -248,7 +248,7 @@ def wrapper(*args, **kwargs): if task_env is not None and task_env.tracing: task_env.add_task(task_name, args) workload = args_to_workload(args, task_name) - tgt = _target.current_target() + tgt = _target.Target.current() cfg = DispatchContext.current.query(tgt, workload) node = topi_compute(cfg, *args) @@ -317,7 +317,7 @@ def wrapper(outs, *args, **kwargs): workload = get_workload(outs) if workload is None: raise RuntimeError("Cannot find workload in attribute of this schedule") - tgt = _target.current_target() + tgt = _target.Target.current() cfg = DispatchContext.current.query(tgt, workload) return topi_schedule(cfg, outs, *args, **kwargs) return wrapper diff --git a/python/tvm/target/generic_func.py b/python/tvm/target/generic_func.py index 13f280a5ab1a..1936ff1511be 100644 --- a/python/tvm/target/generic_func.py +++ b/python/tvm/target/generic_func.py @@ -184,6 +184,7 @@ def dispatch_func(func, *args, **kwargs): fresult = decorate(fdefault, dispatch_func) fresult.fdefault = fdefault fresult.register = register + fresult.generic_func_node = generic_func_node return fresult return fdecorate @@ -268,4 +269,5 @@ def dispatch_func(func, *args, **kwargs): fdecorate = decorate(fdefault, dispatch_func) fdecorate.register = register fdecorate.fdefault = fdefault + fdecorate.dispatch_dict = dispatch_dict return fdecorate diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 3a972b920de2..5b586d34c9bd 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -32,7 +32,7 @@ @conv2d_alter_layout.register(["arm_cpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current _, outs = relay.backend.compile_engine.select_implement( diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 2650bfd77a38..ae8c5e36b3f8 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -459,7 +459,7 @@ def _schedule_winograd(cfg, s, op): ##### REGISTER ALTER OP LAYOUT ##### @nn.conv2d_alter_layout.register(["bifrost"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current _, outs = relay.backend.compile_engine.select_implement( diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index 6c3b9f017445..a8e8fb18afbd 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -30,7 +30,7 @@ @nn.conv2d_alter_layout.register(["cuda", "gpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current _, outs = relay.backend.compile_engine.select_implement( diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 7ba45b3747ae..93797a4b49ba 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -229,16 +229,6 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - - target = tvm.target.Target.current() - if "cublas" in target.libs: - matmul = cublas.matmul(data, weight, False, True, out_dtype) - if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ - tag=tag.BROADCAST) - return matmul - k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), @@ -260,12 +250,8 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_int8.cuda") def schedule_dense_int8(cfg, outs): """Dense schedule for int8 on CUDA""" - s = tvm.create_schedule([x.op for x in outs]) - target = tvm.target.current_target() - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - if "cublas" in target.libs: - return generic.schedule_extern(outs) + s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if "dense_int8" in op.tag: diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 357e87ac96ae..5abf2985273c 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -81,7 +81,7 @@ def _schedule_group_conv2d_nchw_direct(cfg, s, conv): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - target = tvm.target.current_target() + target = tvm.target.Target.current() if target.target_name in ['nvptx', 'rocm']: cfg.define_knob("unroll_explicit", [1]) else: diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py index 6c0cca070f1d..7211d650f4a3 100644 --- a/topi/python/topi/intel_graphics/conv2d_alter_op.py +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -28,7 +28,7 @@ @conv2d_alter_layout.register(["intel_graphics"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): cfg = dispatch_ctx.query(target, None) diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 0ee92280ca96..7dd075714a61 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -425,7 +425,7 @@ def _schedule_winograd(cfg, s, op): ##### REGISTER ALTER OP LAYOUT ##### @nn.conv2d_alter_layout.register(["mali"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current _, outs = relay.backend.compile_engine.select_implement( diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 45622fef68b3..8f7957906825 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -32,7 +32,7 @@ @conv2d_alter_layout.register("cpu") def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): - target = tvm.target.current_target(allow_none=False) + target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): cfg = dispatch_ctx.query(target, None) diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index d983fdae9044..64fe92bbaaa4 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -62,7 +62,7 @@ def is_int8_hw_support(data_dtype, kernel_dtype): is_dtype_support = data_dtype == 'uint8' and kernel_dtype == 'int8' # 2) Check LLVM support - llvm_version = tvm.codegen.llvm_version_major() + llvm_version = tvm.target.codegen.llvm_version_major() is_llvm_support = llvm_version >= 8 # 3) Check target From d87226298da31b54828a00279b69906d0f4f9e8f Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 13 Feb 2020 09:43:18 -0800 Subject: [PATCH 25/48] tweak --- python/tvm/autotvm/record.py | 9 +++++---- topi/python/topi/x86/conv3d.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index a51a290778d5..09f0ba947a6b 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -90,8 +90,9 @@ def encode(inp, result, protocol='json'): if protocol == 'json': json_dict = { "input": (str(inp.target), - inp.task.name, inp.task.args, inp.task.kwargs, - inp.config.to_json_dict()), + inp.task.name, inp.task.args, inp.task.kwargs), + + "config": inp.config.to_json_dict(), "result": (result.costs if result.error_no == 0 else (1e9,), result.error_no, @@ -139,7 +140,7 @@ def decode(row, protocol='json'): logger.warning("AutoTVM log version 0.1 is no longer supported.") return None - tgt, task_name, task_args, task_kwargs, config = row['input'] + tgt, task_name, task_args, task_kwargs = row["input"] tgt = _target.create(str(tgt)) def clean_json_to_python(x): @@ -155,7 +156,7 @@ def clean_json_to_python(x): return x tsk = task.Task(clean_json_to_python(task_name), clean_json_to_python(task_args)) - config = ConfigEntity.from_json_dict(config) + config = ConfigEntity.from_json_dict(row["config"]) inp = MeasureInput(tgt, tsk, config) result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["result"]]) config.cost = np.mean(result.costs) diff --git a/topi/python/topi/x86/conv3d.py b/topi/python/topi/x86/conv3d.py index 4f5b631b5a2a..1e156509c0a8 100644 --- a/topi/python/topi/x86/conv3d.py +++ b/topi/python/topi/x86/conv3d.py @@ -62,6 +62,7 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): 5-D with shape [batch, out_depth, out_height, out_width, out_channel] for NDHWC layout 5-D with shape [batch, out_channel, out_depth, out_height, out_width] for NCDHW layout """ + layout = "NDHWC" out_dtype = data.dtype if out_dtype is None else out_dtype strides = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides) dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation, dilation) @@ -69,7 +70,7 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) if cfg.is_fallback: _get_default_config(cfg, data, kernel, strides, padding, out_dtype, layout) - return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) + return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype) @autotvm.register_topi_schedule("conv3d_ndhwc.x86") @@ -110,7 +111,7 @@ def _traverse(op): return s -def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype assert isinstance(dilation, int) or len(dilation) == 3 From 58789f2f12b96d188926c7cb7fb9999640435f1b Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 13 Feb 2020 09:45:18 -0800 Subject: [PATCH 26/48] tmp hack for vta pass --- vta/python/vta/ir_pass.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 36d8e4198a40..0c9b2eac2df7 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -662,8 +662,12 @@ def _do_fold(op): 0, 0, 0, 0, 0)) inner = irb.get() - args = op.body.body.args - res_tensor = op.body.body.func.output(0) + # TODO(@tmoreau89): This is only a temporary fix, please take a look. + body = op.body.body + while isinstance(body, tvm.stmt.IfThenElse): + body = body.then_case + args = body.args + res_tensor = body.func.output(0) tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.tir.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', From fa6a9d72d7933ad6d03a83a799f03872d849446c Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 14 Feb 2020 05:40:55 +0000 Subject: [PATCH 27/48] fix tutorial --- python/tvm/autotvm/record.py | 11 ++++-- python/tvm/relay/frontend/tflite.py | 2 +- python/tvm/relay/op/strategy/arm_cpu.py | 5 ++- python/tvm/relay/op/strategy/x86.py | 4 +-- tests/scripts/task_python_vta_tsim.sh | 6 ++-- topi/python/topi/cuda/conv2d_alter_op.py | 2 +- topi/python/topi/x86/conv2d.py | 40 ++++++++------------- topi/tests/python/test_topi_conv3d_ndhwc.py | 2 +- tutorials/autotvm/tune_conv2d_cuda.py | 4 +-- tutorials/autotvm/tune_relay_arm.py | 25 +------------ tutorials/autotvm/tune_relay_cuda.py | 14 +------- tutorials/autotvm/tune_relay_mobile_gpu.py | 12 +------ 12 files changed, 39 insertions(+), 88 deletions(-) diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 09f0ba947a6b..171b51769850 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -37,6 +37,7 @@ from .measure import MeasureInput, MeasureResult AUTOTVM_LOG_VERSION = 0.2 +_old_version_warning = True logger = logging.getLogger('autotvm') try: # convert unicode to str for python2 @@ -134,10 +135,14 @@ def decode(row, protocol='json'): result: autotvm.tuner.MeasureResult """ # pylint: disable=unused-variable + global _old_version_warning + if protocol == 'json': row = json.loads(row) if 'v' in row and row['v'] == 0.1: - logger.warning("AutoTVM log version 0.1 is no longer supported.") + if _old_version_warning: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + _old_version_warning = False return None tgt, task_name, task_args, task_kwargs = row["input"] @@ -165,7 +170,9 @@ def clean_json_to_python(x): if protocol == 'pickle': items = row.split("\t") if len(items) == 4: - logger.warning("AutoTVM log version 0.1 is no longer supported.") + if _old_version_warning: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + _old_version_warning = False return None tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index dd3587125aec..352bc6302ee0 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1156,7 +1156,7 @@ def convert_conv(self, op, conv_type): if is_depthwise_conv: params['channels'] = int(in_channels) - params['groups'] = int(in_channels) + params['groups'] = int(input_c) params['kernel_layout'] = 'HWOI' else: params['channels'] = int(output_channels) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index c88267bf36bb..850001d9ede2 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -68,10 +68,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.arm_cpu", plevel=15) - if pt == 1 and pb == 1 and pl == 1 and pr == 1: + if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1: strategy.add_implement( - wrap_compute_conv2d_winograd_nnpack( - topi.arm_cpu.conv2d_nchw_winograd_nnpack), + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), name="conv2d_nchw_winograd_nnpack.arm_cpu", plevel=13) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 64901fa49149..86576ffcd7fc 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -95,7 +95,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): name="conv2d_nhwc.x86") elif layout == "HWCN": assert kernel_layout == "HWIO" - logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + logger.warning("conv2d HWCN layout is not optimized for x86.") strategy.add_implement( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), @@ -120,7 +120,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): name="depthwise_conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWOI" - logger.warning("depthwise_conv2d_nhwc is not optimized for x86.") + logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.") strategy.add_implement( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh index eba62e537a85..5f194b297678 100755 --- a/tests/scripts/task_python_vta_tsim.sh +++ b/tests/scripts/task_python_vta_tsim.sh @@ -46,8 +46,10 @@ echo "Running unittest in tsim..." python3 -m pytest -v vta/tests/python/unittest # Run unit tests in cycle accurate simulator -echo "Running integration test in tsim..." -python3 -m pytest -v vta/tests/python/integration +# TODO(@icemelon9): temporarily disable tsim test because it takes a long time without tophub logs. +# Re-enable this test after update the tophub logs. +# echo "Running integration test in tsim..." +# python3 -m pytest -v vta/tests/python/integration # Reset default fsim simulation cp vta/config/fsim_sample.json vta/config/vta_config.json diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index a8e8fb18afbd..09806733129b 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -103,7 +103,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( - [new_data, new_weight, strides, padding, dilation, out_dtype, tile_size], + [new_data, new_weight, strides, padding, dilation, out_dtype], "conv2d_nchw_winograd_without_weight_transform.cuda") dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_winograd_without_weight_transform( diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index d24ceb5de3f2..66ba49704575 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -79,24 +79,8 @@ def schedule_conv2d_nhwc(outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - else: # inject custom schedule - if len(op.axis) == 4: # schedule bias + bn + relu - n, h, w, c = op.axis - fused = s[op].fuse(n, h, w) - s[op].parallel(fused) - s[op].vectorize(c) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) + def _callback(op): if 'conv2d_nhwc' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] @@ -115,17 +99,21 @@ def traverse(op): C = conv n, h, w, c = C.op.axis ry, rx, rc = C.op.reduce_axis - n_out, h_out, w_out, c_out = output_op.axis s[C].vectorize(c) - if op != output_op: # fuse bias + bn + relu into conv - s[C].compute_at(s[output_op], c_out) - else: - fused = s[C].fuse(n, h, w) - s[C].parallel(fused) - - scheduled_ops.append(op) - traverse(output_op) + O = output_op + if len(O.axis) == 4: # schedule bias + bn + relu + n, h, w, c = O.axis + fused = s[O].fuse(n, h, w) + s[O].parallel(fused) + channels = int(O.output(0).shape[-1]) + if channels % 64 == 0: + c, ci = s[O].split(c, 64) + s[O].vectorize(ci) + if C != O: + s[C].compute_at(s[O], c) + + traverse_inline(s, output_op, _callback) return s def conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): diff --git a/topi/tests/python/test_topi_conv3d_ndhwc.py b/topi/tests/python/test_topi_conv3d_ndhwc.py index 0bda67b19333..7e2f02cea20a 100644 --- a/topi/tests/python/test_topi_conv3d_ndhwc.py +++ b/topi/tests/python/test_topi_conv3d_ndhwc.py @@ -65,7 +65,7 @@ def check_device(device): print("Running on target: %s" % device) fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ndhwc_implement) with tvm.target.create(device): - B = fcompute(A, W, stride, padding, dilation) + B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 09b56045edaf..0e26dcb97412 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -78,7 +78,7 @@ # can be very large (at the level of 10^9 for some input shapes) # -@autotvm.template +@autotvm.register_customized_task("tutorial/conv2d_no_batching") def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): assert N == 1, "Only consider batch_size = 1 in this template" @@ -180,7 +180,7 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): # the last layer in resnet N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -task = autotvm.task.create(conv2d_no_batching, +task = autotvm.task.create("tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target='cuda') print(task.config_space) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 5f71068b8136..9aba93798617 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -248,30 +248,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True, - try_spatial_pack_depthwise=False): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - input_channel = tsk.workload[1][1] - if input_channel >= 64: - tasks[i] = tsk - except Exception: - pass - - # if we want to use spatial pack for depthwise convolution - if try_spatial_pack_depthwise: - tuner = 'xgb_knob' - for i in range(len(tasks)): - if tasks[i].name == 'topi_nn_depthwise_conv2d_nchw': - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, - 'contrib_spatial_pack') - tasks[i] = tsk - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index dca680e6f039..58c8751b73b9 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -163,19 +163,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - input_channel = tsk.workload[1][1] - if input_channel >= 64: - tasks[i] = tsk - except Exception: - pass - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 30ac719338ae..5425f1b15715 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -247,17 +247,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - tasks.append(tsk) - except Exception: - pass - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): From c1bf7259d30266499098593f4a79d9fd1d74388e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 13 Feb 2020 21:50:32 -0800 Subject: [PATCH 28/48] fix --- python/tvm/autotvm/record.py | 2 +- tests/python/relay/test_op_qnn_conv2d.py | 22 ++-------------------- topi/python/topi/x86/conv2d.py | 10 ++++------ 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 171b51769850..90857a135933 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -136,7 +136,7 @@ def decode(row, protocol='json'): """ # pylint: disable=unused-variable global _old_version_warning - + if protocol == 'json': row = json.loads(row) if 'v' in row and row['v'] == 0.1: diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index 3b88e1c925d8..e827c722b255 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -116,23 +116,13 @@ def get_funcs(data_shape, data_layout, kernel_layout, out_dtype, - groups=1): + groups=1, + channels=None): data = relay.var("data", shape=data_shape, dtype=data_dtype) kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype) - if groups > 1: - channels = groups - elif kernel_layout == "OIHW": - channels = kernel_shape[0] - elif kernel_layout == "HWIO": - channels = kernel_shape[3] - elif kernel_layout == "HWOI": - channels = kernel_shape[2] - else: - raise NotImplementedError - ref_func = get_ref_func(data, kernel, input_zero_point, @@ -827,12 +817,8 @@ def test_depthwise_depth_multiplier(): data_layout="NCHW", kernel_layout="OIHW", out_dtype="int32", -<<<<<<< HEAD - groups=8) -======= groups=4, channels=8) ->>>>>>> fix more tests & bugs verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) @@ -881,12 +867,8 @@ def test_depthwise_depth_multiplier(): data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32", -<<<<<<< HEAD - groups=8) -======= groups=4, channels=8) ->>>>>>> fix more tests & bugs verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 66ba49704575..2403b01b7453 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -22,7 +22,6 @@ import tvm from tvm import autotvm -from .. import tag from .. import nn from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload from ..nn.conv2d import unpack_NCHWc_to_nchw @@ -98,15 +97,14 @@ def _callback(op): s[data_pad].parallel(pad_fused) C = conv n, h, w, c = C.op.axis - ry, rx, rc = C.op.reduce_axis s[C].vectorize(c) - O = output_op - if len(O.axis) == 4: # schedule bias + bn + relu - n, h, w, c = O.axis + O = output_op.output(0) + if len(O.op.axis) == 4: # schedule bias + bn + relu + n, h, w, c = O.op.axis fused = s[O].fuse(n, h, w) s[O].parallel(fused) - channels = int(O.output(0).shape[-1]) + channels = int(O.shape[-1]) if channels % 64 == 0: c, ci = s[O].split(c, 64) s[O].vectorize(ci) From 206c85959074b9773f510c905f842146ac3c380f Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 14 Feb 2020 19:15:34 +0000 Subject: [PATCH 29/48] fix more tutorials --- topi/tests/python/test_topi_tensor.py | 3 ++- tutorials/dev/relay_pass_infra.py | 6 +++--- tutorials/optimize/opt_matmul_auto_tensorcore.py | 5 +++-- tutorials/topi/intro_topi.py | 8 ++++---- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/topi/tests/python/test_topi_tensor.py b/topi/tests/python/test_topi_tensor.py index 8e7073f4060b..05098421c561 100644 --- a/topi/tests/python/test_topi_tensor.py +++ b/topi/tests/python/test_topi_tensor.py @@ -18,6 +18,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.contrib.nvcc import have_fp16 @@ -98,7 +99,7 @@ def check_device(device): A = tvm.placeholder((n, m), name='A', dtype=dtype) B = tvm.compute((n, m), lambda i, j: A[i, j] + tvm.const(1, A.dtype), name='B') - S = topi.generic.schedule_elemwise(B) + S = topi.testing.get_elemwise_schedule(device)(B) fun = tvm.build(S, [A, B], device) np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom( diff --git a/tutorials/dev/relay_pass_infra.py b/tutorials/dev/relay_pass_infra.py index d27e236a2572..494593eeb5a1 100644 --- a/tutorials/dev/relay_pass_infra.py +++ b/tutorials/dev/relay_pass_infra.py @@ -78,7 +78,7 @@ def example(): # the scope of this tutorial. @relay.op.register_alter_op_layout("nn.conv2d", level=101) -def alter_conv2d(attrs, inputs, tinfos): +def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -245,10 +245,10 @@ def visit_const(self, c): f = example() mod = tvm.IRModule.from_expr(f) seq = relay.transform.Sequential([relay.transform.FoldConstant(), - relay.transform.PrintIR(), + relay.transform.PrintIR(False), relay.transform.EliminateCommonSubexpr(), relay.transform.FuseOps(), - relay.transform.PrintIR()]) + relay.transform.PrintIR(False)]) with relay.build_config(opt_level=3): mod = seq(mod) diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index f7cdae227b75..a4658eba2bee 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -94,7 +94,7 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'): # # We use AutoTVM to search for best configurations in this schedule. -@autotvm.template +@autotvm.register_customized_task("tutorial/test_gemm") def test_gemm(N, L, M, dtype, layout): if (layout == "NN"): shape_a = (N, L) @@ -264,7 +264,8 @@ def test_gemm(N, L, M, dtype, layout): assert(major == 7 and minor == 5 and layout == 'TN') def tune_and_evaluate(M, N, L, dtype, layout): - task = autotvm.task.create(test_gemm, args=(N, L, M, dtype, layout), target='cuda') + task = autotvm.task.create("tutorial/test_gemm", args=(N, L, M, dtype, layout), + target='cuda') print(task.config_space) logging.getLogger('autotvm').setLevel(logging.DEBUG) diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index 390085ea70b5..2e049828e5cc 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -85,7 +85,7 @@ f = e / 2.0 g = topi.sum(f) with tvm.target.cuda(): - sg = topi.generic.schedule_reduce(g) + sg = topi.cuda.schedule_reduce(g) print(tvm.lower(sg, [a, b], simple_mode=True)) ###################################################################### @@ -113,7 +113,7 @@ tarray = tvm.placeholder((512, 512), name="tarray") softmax_topi = topi.nn.softmax(tarray) with tvm.target.create("cuda"): - sst = topi.generic.schedule_softmax(softmax_topi) + sst = topi.cuda.schedule_softmax(softmax_topi) print(tvm.lower(sst, [tarray], simple_mode=True)) ###################################################################### @@ -133,9 +133,9 @@ kernel = tvm.placeholder((10, 3, 5, 5)) with tvm.target.create("cuda"): - conv = topi.nn.conv2d(data, kernel, strides=1, padding=2, dilation=1) + conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1) out = topi.nn.relu(conv) - sconv = topi.generic.nn.schedule_conv2d_nchw([out]) + sconv = topi.cuda.schedule_conv2d_nchw([out]) print(tvm.lower(sconv, [data, kernel], simple_mode=True)) ###################################################################### From 0f36deb273d1116bee3147848c14448a044fe417 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 14 Feb 2020 11:17:38 -0800 Subject: [PATCH 30/48] fix vta tutorial --- vta/python/vta/top/__init__.py | 8 ++++---- vta/python/vta/top/op.py | 14 +++++++++++--- .../integration/test_benchmark_topi_conv2d.py | 4 ++-- .../test_benchmark_topi_conv2d_transpose.py | 4 ++-- .../integration/test_benchmark_topi_dense.py | 4 ++-- .../test_benchmark_topi_group_conv2d.py | 4 ++-- vta/tutorials/autotvm/tune_relay_vta.py | 11 ++++++----- 7 files changed, 29 insertions(+), 20 deletions(-) diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 7fdf27f8e01a..6f62aff469d4 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -20,8 +20,8 @@ from . import bitpack from .graphpack import graph_pack from . import op -from . import vta_conv2d -from . import vta_conv2d_transpose -from . import vta_group_conv2d -from . import vta_dense +from .vta_conv2d import conv2d_packed, schedule_conv2d_packed +from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed +from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed +from .vta_dense import dense_packed, schedule_dense_packed from . import util diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 4905992cc06c..4da6e1916c92 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -36,9 +36,8 @@ # override to force partition at copy reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - -@reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, output_type, target): +# add clip vta strategy +def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. """ x = inputs[0] a_min = attrs.a_min @@ -52,6 +51,15 @@ def compute_clip(attrs, inputs, output_type, target): x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return [x] +def clip_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implement( + compute_clip_vta, + _strategy.wrap_topi_schedule(topi.generic.schedule_injective), + name="clip.vta") + return strategy + +reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 1058eef29d22..9e65eab8e154 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -85,8 +85,8 @@ def run_conv2d(env, remote, wl, target, elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - conv2d_fcompute = vta.top.vta_conv2d.conv2d_packed - conv2d_fschedule = vta.top.vta_conv2d.schedule_conv2d_packed + conv2d_fcompute = vta.top.conv2d_packed + conv2d_fschedule = vta.top.schedule_conv2d_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index fa372946feca..284655adf6da 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -86,8 +86,8 @@ def run_conv2d_transpose(env, remote, wl, target, elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - fcompute = vta.top.vta_conv2d_transpose.conv2d_transpose_packed - fschedule = vta.top.vta_conv2d_transpose.schedule_conv2d_transpose_packed + fcompute = vta.top.conv2d_transpose_packed + fschedule = vta.top.schedule_conv2d_transpose_packed # Derive shapes depending upon packing diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index 52bdf3698e00..a0acdc34acef 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -63,8 +63,8 @@ def run_gemm(env, remote, target, env.BATCH, env.BLOCK_IN) kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) - fcompute = vta.top.vta_dense.dense_packed - fschedule = vta.top.vta_dense.schedule_dense_packed + fcompute = vta.top.dense_packed + fschedule = vta.top.schedule_dense_packed else: data_shape = a_shape kernel_shape = w_shape diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index ff883c38b868..5ec1be8ec0dc 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -81,8 +81,8 @@ def run_group_conv2d(env, remote, wl, target, elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - fcompute = vta.top.vta_group_conv2d.group_conv2d_packed - fschedule = vta.top.vta_group_conv2d.schedule_group_conv2d_packed + fcompute = vta.top.group_conv2d_packed + fschedule = vta.top.schedule_group_conv2d_packed # Derive shapes depending upon packing CI_G = wl.in_filter // wl.groups diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 3a8c877a6d14..a20b8ec8d3d3 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -296,7 +296,6 @@ def tune_tasks(tasks, def register_vta_tuning_tasks(): from tvm.autotvm.task import TaskExtractEnv - from tvm.autotvm.task.task import deserialize_args @tvm.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): @@ -310,20 +309,19 @@ def my_clip(x, a_min, a_max): # init autotvm env to register VTA operator TaskExtractEnv() - @autotvm.task.register("topi_nn_conv2d", override=True) + @autotvm.register_customized_task("conv2d_packed.vta") def _topi_nn_conv2d(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) A, W = args[:2] with tvm.target.vta(): - res = topi.nn.conv2d(*args, **kwargs) + res = vta.top.conv2d_packed(*args, **kwargs) res = topi.right_shift(res, 8) res = my_clip(res, 0, 127) res = topi.cast(res, "int8") if tvm.target.Target.current().device_name == 'vta': - s = topi.generic.schedule_conv2d_nchw([res]) + s = vta.top.schedule_conv2d_packed([res]) else: s = tvm.create_schedule([res.op]) return s, [A, W, res] @@ -361,6 +359,9 @@ def tune_and_evaluate(tuning_opt): target=target, target_host=env.target_host) + # filter out non-packed conv2d task + tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) + # We should have extracted 10 convolution tasks assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) From 0bf960b16d021248adf16f1d6dfc892fb74c6892 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 14 Feb 2020 17:08:37 -0800 Subject: [PATCH 31/48] minor --- python/tvm/autotvm/task/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 83ace71fac31..d83a6d4e5581 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -55,7 +55,7 @@ def _encode(x): if x is None: return None raise RuntimeError('Do not support type "%s" in argument. Consider to use' - 'primitive types or tvm.expr.Var only' % type(x)) + 'primitive types or tvm.tir.Var only' % type(x)) ret = [] for t in args: ret.append(_encode(t)) From dd17aa143db341ce3c0df61acdce9a7e7beb3604 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sat, 15 Feb 2020 20:48:57 -0800 Subject: [PATCH 32/48] address comments --- python/tvm/autotvm/task/topi_integration.py | 3 -- python/tvm/relay/op/op.py | 8 +++--- src/relay/op/nn/pooling.cc | 4 +-- src/relay/op/tensor/transform.cc | 4 +-- topi/python/topi/intel_graphics/conv2d.py | 32 --------------------- 5 files changed, 8 insertions(+), 43 deletions(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 2f08864bc771..841bc5f39b72 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -121,9 +121,6 @@ def add_task(self, task_name, args): args: tuple Arguments to the TOPI function. - - cond: SpecializedCondition - Specialized condition to enable the TOPI template. """ key = (task_name, serialize_args(args)) if self.allow_duplicate or key not in self.task_collection: diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index da25cd1f033a..5cf3cf2e0b66 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -155,7 +155,7 @@ def compute(self, attrs, inputs, out_type): attrs : Attrs Op attributes. - inputs : list[tvm.Tensor] + inputs : list[tvm.tensor.Tensor] The input tensors. out_type : relay.Type @@ -163,7 +163,7 @@ def compute(self, attrs, inputs, out_type): Returns ------- - outs : list[tvm.Tensor] + outs : list[tvm.tensor.Tensor] The output tensors. """ return _OpImplementCompute(self, attrs, inputs, out_type) @@ -176,10 +176,10 @@ def schedule(self, attrs, outs, target): attrs : Attrs Op attributes. - outs : list[tvm.Tensor] + outs : list[tvm.tensor.Tensor] The output tensors. - target : tvm.Target + target : tvm.target.Target The target to schedule the op. Returns diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index e9057b7ac086..77baae567ab6 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -164,8 +164,8 @@ bool Pool2DRel(const Array& types, template Array Pool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 53bcba7f1356..fa96d7fcbee7 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1674,8 +1674,8 @@ bool SqueezeRel(const Array& types, } Array SqueezeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type) { + const Array& inputs, + const Type& out_type) { const SqueezeAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::squeeze(inputs[0], param->axis) }; diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 15211f5cb1d3..8993063b16e3 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -143,38 +143,6 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. -# @autotvm.task.register("topi_intel_graphics_conv2d_NCHWc") -# def __topi_nn_conv2d_NCHWc(*args, **kwargs): -# assert not kwargs, "Do not support kwargs in template function call" -# data, kernel, strides, padding, dilation, layout, dtype = deserialize_args(args) -# raw_data_shape = get_const_tuple(data.shape) -# raw_kernel_shape = get_const_tuple(kernel.shape) -# -# # get config here -# cfg = get_config() -# _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout) -# cfg.add_flop(1) -# -# # change shape with the value in config -# ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] -# oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] -# -# new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, -# raw_data_shape[2], raw_data_shape[3], ic_bn) -# new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, -# raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) -# new_data = tvm.placeholder(new_data_shape, data.dtype) -# new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) -# -# C = _decl_cl_spatialpack_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, dtype) -# s = _schedule_conv2d_NCHWc(cfg, [C]) -# -# return s, [new_data, new_kernel, C] - def _pack_data(data, kernel, ic_bn, oc_bn): n, _, ih, iw = get_const_tuple(data.shape) From e1669e3e6dc6c6e61aaeb5ad5cf45e411675c742 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sat, 15 Feb 2020 21:55:02 -0800 Subject: [PATCH 33/48] fix --- tests/python/unittest/test_codegen_cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index ec36a5fa5a7a..8652817c21ce 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -305,7 +305,7 @@ def check_cuda(dtype, m=32, n=32): e = topi.elemwise_sum([c, d]) g = topi.sum(e) with tvm.target.cuda(): - sg = topi.generic.schedule_reduce(g) + sg = topi.cuda.schedule_reduce(g) ctx = tvm.gpu(0) func = tvm.build(sg, [a, b, g], 'cuda') a_np = np.random.uniform(size=(m, n)).astype(a.dtype) From 5eee588da4df783784e6f09862aead55073202a0 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 16 Feb 2020 21:08:18 -0800 Subject: [PATCH 34/48] address comments --- include/tvm/relay/op_attr_types.h | 124 ------------- include/tvm/relay/op_strategy.h | 164 ++++++++++++++++++ python/tvm/autotvm/task/task.py | 40 ++++- python/tvm/autotvm/task/topi_integration.py | 13 +- python/tvm/relay/backend/compile_engine.py | 3 +- python/tvm/relay/expr.py | 3 - src/relay/backend/compile_engine.h | 1 + .../ir/{op_attr_types.cc => op_strategy.cc} | 9 +- 8 files changed, 218 insertions(+), 139 deletions(-) create mode 100644 include/tvm/relay/op_strategy.h rename src/relay/ir/{op_attr_types.cc => op_strategy.cc} (96%) diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 5f02f42bac1c..1a2263e3f187 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -224,130 +224,6 @@ using FShapeFunc = runtime::TypedPackedFunc< const Array& inputs, const Array& out_ndims)>; -/*! - * \brief Operator implementation in TVM. - */ -class OpImplementNode : public Object { - public: - /*! \brief Compute function */ - FTVMCompute fcompute; - /*! \brief Schedule function */ - FTVMSchedule fschedule; - /*! \brief Name of the implementation */ - std::string name; - /*! \brief Priority level */ - int plevel; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("name", &name); - v->Visit("plevel", &plevel); - } - - static constexpr const char* _type_key = "relay.OpImplement"; - TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementNode, Object); -}; - -/*! - * \brief Operator implementation class. - */ -class OpImplement : public ObjectRef { - public: - /*! - * \brief Invoke the operator compute function. - * \param attrs The attribute of the primitive - * \param inputs The input tensors. - * \param out_type The output type information. - * \return The output compute description of the operator. - */ - TVM_DLL Array Compute(const Attrs& attrs, - const Array& inputs, - const Type& out_type); - /*! - * \brief Build the computation schedule. - * \param attrs The attribute of the node. - * \param outs The output tensors. - * \param target The build target. - * \return The computation schedule. - */ - TVM_DLL te::Schedule Schedule(const Attrs& attrs, - const Array& outs, - const Target& target); - - TVM_DEFINE_OBJECT_REF_METHODS(OpImplement, ObjectRef, OpImplementNode); -}; - -/*! - * \brief Specialized implementations for operators under certain conditions. - */ -class OpSpecializationNode : public Object { - public: - /*! \brief List of implementations. */ - Array implements; - /*! \brief Condition to enable the specialization. - * Could be undefined to represent generic case. */ - te::SpecializedCondition condition; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("condition", &condition); - v->Visit("implements", &implements); - } - - static constexpr const char* _type_key = "relay.OpSpecialization"; - TVM_DECLARE_FINAL_OBJECT_INFO(OpSpecializationNode, ExprNode); -}; - -/*! - * \brief Operator specialization class. - */ -class OpSpecialization : public ObjectRef { - public: - /*! - * \brief Add an implementation. - * \param fcompute Compute function - * \param fschedule Schedule function - * \param name Name of the implementation - * \param plevel Priority level of the implementation - */ - TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - std::string name, int plevel); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); -}; - -/*! - * \brief Operator strategy to choose implementation. - */ -class OpStrategyNode : public Object { - public: - /*! \brief List of operator specializations. */ - Array specializations; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("specializations", &specializations); - } - - static constexpr const char* _type_key = "relay.OpStrategy"; - TVM_DECLARE_FINAL_OBJECT_INFO(OpStrategyNode, ExprNode); -}; - -/*! - * \brief Operator strategy class. - */ -class OpStrategy : public ObjectRef { - public: - /*! - * \brief Add an implementation. - * \param fcompute Compute function - * \param fschedule Schedule function - * \param name Name of the implementation - * \param plevel Priority level of the implementation - */ - TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - std::string name, int plevel); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); -}; - } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_ATTR_TYPES_H_ diff --git a/include/tvm/relay/op_strategy.h b/include/tvm/relay/op_strategy.h new file mode 100644 index 000000000000..70897980ccdd --- /dev/null +++ b/include/tvm/relay/op_strategy.h @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/op_strategy.h + * \brief The Relay operator Strategy and related data structure. + */ + +#ifndef TVM_RELAY_OP_STRATEGY_H_ +#define TVM_RELAY_OP_STRATEGY_H_ + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +/*! + * \brief Operator implementation that includes compute and schedule function. + */ +class OpImplementNode : public Object { + public: + /*! \brief Compute function */ + FTVMCompute fcompute; + /*! \brief Schedule function */ + FTVMSchedule fschedule; + /*! \brief Name of the implementation */ + std::string name; + /*! \brief Priority level */ + int plevel; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("name", &name); + v->Visit("plevel", &plevel); + } + + static constexpr const char* _type_key = "relay.OpImplement"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementNode, Object); +}; + +/*! + * \brief Operator implementation class. + */ +class OpImplement : public ObjectRef { + public: + /*! + * \brief Invoke the operator compute function. + * \param attrs The attribute of the primitive + * \param inputs The input tensors. + * \param out_type The output type information. + * \return The output compute description of the operator. + */ + TVM_DLL Array Compute(const Attrs& attrs, + const Array& inputs, + const Type& out_type); + /*! + * \brief Build the computation schedule. + * \param attrs The attribute of the node. + * \param outs The output tensors. + * \param target The build target. + * \return The computation schedule. + */ + TVM_DLL te::Schedule Schedule(const Attrs& attrs, + const Array& outs, + const Target& target); + + TVM_DEFINE_OBJECT_REF_METHODS(OpImplement, ObjectRef, OpImplementNode); +}; + +/*! + * \brief Specialized implementations for operators under certain conditions. + */ +class OpSpecializationNode : public Object { + public: + /*! \brief List of implementations. */ + Array implements; + /*! \brief Condition to enable the specialization. + * Could be undefined to represent generic case. */ + te::SpecializedCondition condition; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("condition", &condition); + v->Visit("implements", &implements); + } + + static constexpr const char* _type_key = "relay.OpSpecialization"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpSpecializationNode, ExprNode); +}; + +/*! + * \brief Operator specialization class. + */ +class OpSpecialization : public ObjectRef { + public: + /*! + * \brief Add an implementation. + * \param fcompute Compute function + * \param fschedule Schedule function + * \param name Name of the implementation + * \param plevel Priority level of the implementation + */ + TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); +}; + +/*! + * \brief Operator strategy to choose implementation. + */ +class OpStrategyNode : public Object { + public: + /*! \brief List of operator specializations. */ + Array specializations; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("specializations", &specializations); + } + + static constexpr const char* _type_key = "relay.OpStrategy"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpStrategyNode, ExprNode); +}; + +/*! + * \brief Operator strategy class. + */ +class OpStrategy : public ObjectRef { + public: + /*! + * \brief Add an implementation. + * \param fcompute Compute function + * \param fschedule Schedule function + * \param name Name of the implementation + * \param plevel Priority level of the implementation + */ + TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_OP_STRATEGY_H_ diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index d83a6d4e5581..d09c540dcd21 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -274,7 +274,12 @@ def _do_reg(f): return _do_reg def register_customized_task(name, func=None): - """Register a customized function to autotvm task. + """Register a customized function to AutoTVM task. + + In most cases, you can just use register_topi_compute and register_topi_schedule + with the same task name to define an AutoTVM task. However, you can also + create a customized AutoTVM task that defines a tunable template or performs + extra layout transform before invoking compute/schedule function. Parameters ---------- @@ -289,6 +294,39 @@ def register_customized_task(name, func=None): ------- decorator: callable A decorator + + Examples + -------- + The following code is a tunable template for a blocked matrix multiplication + + .. code-block:: python + + @autotvm.register_customized_task("matmul") + def matmul(N, L, M, dtype): + A = tvm.placeholder((N, L), name='A', dtype=dtype) + B = tvm.placeholder((L, M), name='B', dtype=dtype) + + k = tvm.reduce_axis((0, L), name='k') + C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') + s = tvm.create_schedule(C.op) + + # schedule + y, x = s[C].op.axis + k = s[C].op.reduce_axis[0] + + ##### define space begin ##### + cfg = autotvm.get_config() + cfg.define_split("tile_y", y, num_outputs=2) + cfg.define_split("tile_x", x, num_outputs=2) + ##### define space end ##### + + # schedule according to config + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + + s[C].reorder(yo, xo, k, yi, xi) + + return s, [A, B, C] """ def _do_reg(f): if name not in TASK_TABLE: diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 841bc5f39b72..f815b008e388 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -162,10 +162,9 @@ def get(allow_duplicate=False): def register_topi_compute(task_name, func=None): """Register a tunable template for a topi compute function. - After the registration, this topi compute will become a configuration dispatcher. It uses - all its argument as workload and dispatches configurations according to the input workload. - - It also stores this "workload" to its final ComputeOp, which can be used to reconstruct + The registration will wrap this topi compute to take `cfg` as the first argument, + followed by the original argument list. It uses all its argument as workload and + stores this "workload" to its final ComputeOp, which can be used to reconstruct "workload" in the following topi_schedule call. Parameters @@ -281,13 +280,13 @@ def wrapper(*args, **kwargs): def register_topi_schedule(task_name, func=None): """Register a tunable template for a topi schedule function. - After the registration. This topi schedule will become a configuration dispatcher. It dispatches - configurations according to the input workload. + The registration will wrap this topi schedule to take `cfg` as the first argument, + followed by the original argument list. Note that this function will try to find "workload" from all the ComputeOp in the input. You can attach "workload" to your compute op by using :any:`register_topi_compute`. - The task name need to match with the task name of the corresponding topi compute function. + The task name has to be the same as that of the corresponding topi compute function. Parameters ---------- diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 94415939b67f..4034666ac8bb 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -121,13 +121,14 @@ def get_valid_implements(op, attrs, inputs, out_type, target): assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name with target: strategy = fstrategy(attrs, inputs, out_type, target) + analyzer = tvm.arith.Analyzer() ret = [] for spec in strategy.specializations: if spec.condition: # check if all the clauses in the specialized condition are true flag = True for clause in spec.condition.clauses: - clause = tvm.ir_pass.Simplify(clause) + clause = analyzer.canonical_simplify(clause) if isinstance(clause, tvm.expr.IntImm) and clause.value: continue flag = False diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 22d89050298c..2b5a39ae992d 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -277,9 +277,6 @@ def set_params(self, params): return _expr.FunctionSetParams(self, params) - def is_primitive(self): - return int(self.get_attribute("Primitive")) == 1 - def get_attribute(self, name): return _expr.FunctionGetAttr(self, name) diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index ff9566d68625..8cb2f1574894 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/src/relay/ir/op_attr_types.cc b/src/relay/ir/op_strategy.cc similarity index 96% rename from src/relay/ir/op_attr_types.cc rename to src/relay/ir/op_strategy.cc index 51344d196a1e..72886ceba9bf 100644 --- a/src/relay/ir/op_attr_types.cc +++ b/src/relay/ir/op_strategy.cc @@ -17,8 +17,12 @@ * under the License. */ -#include -#include +/*! + * \file src/tvm/relay/ir/op_strategy.cc + * \brief The Relay operator Strategy and related data structure. + */ + +#include namespace tvm { namespace relay { @@ -106,6 +110,5 @@ TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplement") strategy.AddImplement(compute, schedule, name, plevel); }); - } // namespace relay } // namespace tvm From 2f3c719a7cf170ceccc9fbdcae59c747dcf9fded Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 16 Feb 2020 21:33:53 -0800 Subject: [PATCH 35/48] fix cpptest --- tests/cpp/relay_build_module_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index e30209948f67..8c37c5a5437a 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include From 0445e5ac7fe4a862d3ab84a23327514a220514dd Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 17 Feb 2020 09:28:24 -0800 Subject: [PATCH 36/48] fix docs --- topi/python/topi/cuda/conv3d.py | 30 ++++++++++++------------------ topi/python/topi/nn/conv3d.py | 8 ++++---- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index 6424d2fb8884..0a6a71ccc2f0 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -27,7 +27,7 @@ @autotvm.register_topi_compute("conv3d_ncdhw.cuda") def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): - """Conv3D operator for cuda backend. + """Conv3D operator in NCDHW layout for cuda backend. Parameters ---------- @@ -92,35 +92,29 @@ def _callback(op): @autotvm.register_topi_compute("conv3d_ndhwc.cuda") def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): - """Conv3D operator for cuda backend. + """Conv3d operator in NDHWC layout for cuda backend. Parameters ---------- - cfg: ConfigEntity - The config for this template + Input : tvm.Tensor + 5-D with shape [batch, in_depth, in_height, in_width, in_channel] - data : tvm.Tensor - 5-D with shape [batch, in_channel, in_depth, in_height, in_width] - - kernel : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + Filter : tvm.Tensor + 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] - strides : int or a list/tuple of three ints - stride size, or [stride_depth, stride_height, stride_width] + stride : int or a list/tuple of three ints + Stride size, or [stride_depth, stride_height, stride_width] - padding : int or a list/tuple of three ints - padding size, or [pad_depth, pad_height, pad_width] + padding : int or str + Padding size, or ['VALID', 'SAME'] dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - out_dtype: str - The output type. This is used for mixed precision. - Returns ------- - output : tvm.Tensor - 5-D with shape [batch, out_channel, out_depth, out_height, out_width] + Output : tvm.Tensor + 5-D with shape [batch, out_depth, out_height, out_width, out_channel] """ return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype) diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index cc5cbe6af3c5..88c7c6a3ed90 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -100,13 +100,13 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Parameters ---------- Input : tvm.Tensor - 5-D with shape [batch, in_channel, in_depth, in_height, in_width] + 5-D with shape [batch, in_depth, in_height, in_width, in_channel] Filter : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of three ints - Stride size, or [strid_depth, stride_height, stride_width] + Stride size, or [stride_depth, stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] @@ -117,7 +117,7 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Returns ------- Output : tvm.Tensor - 5-D with shape [batch, out_channel, out_depth, out_height, out_width] + 5-D with shape [batch, out_depth, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 3 assert isinstance(dilation, int) or len(dilation) == 3 From e496ae0c73afaf2447200a0935bc6fe1787b9127 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 17 Feb 2020 14:23:37 -0800 Subject: [PATCH 37/48] change data structure name and api --- include/tvm/relay/op_strategy.h | 22 +-- python/tvm/relay/op/op.py | 14 +- python/tvm/relay/op/strategy/arm_cpu.py | 30 ++-- python/tvm/relay/op/strategy/bifrost.py | 14 +- python/tvm/relay/op/strategy/cuda.py | 166 ++++++++++-------- python/tvm/relay/op/strategy/generic.py | 118 +++++++------ python/tvm/relay/op/strategy/hls.py | 22 +-- .../tvm/relay/op/strategy/intel_graphics.py | 8 +- python/tvm/relay/op/strategy/mali.py | 14 +- python/tvm/relay/op/strategy/opengl.py | 12 +- python/tvm/relay/op/strategy/rocm.py | 29 +-- python/tvm/relay/op/strategy/x86.py | 100 +++++------ src/relay/backend/compile_engine.cc | 20 +-- src/relay/backend/compile_engine.h | 8 +- src/relay/ir/op_strategy.cc | 50 +++--- tests/cpp/relay_build_module_test.cc | 4 +- vta/python/vta/top/op.py | 10 +- 17 files changed, 328 insertions(+), 313 deletions(-) diff --git a/include/tvm/relay/op_strategy.h b/include/tvm/relay/op_strategy.h index 70897980ccdd..3824f9fae6ad 100644 --- a/include/tvm/relay/op_strategy.h +++ b/include/tvm/relay/op_strategy.h @@ -38,7 +38,7 @@ namespace relay { /*! * \brief Operator implementation that includes compute and schedule function. */ -class OpImplementNode : public Object { +class OpImplementationNode : public Object { public: /*! \brief Compute function */ FTVMCompute fcompute; @@ -54,14 +54,14 @@ class OpImplementNode : public Object { v->Visit("plevel", &plevel); } - static constexpr const char* _type_key = "relay.OpImplement"; - TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementNode, Object); + static constexpr const char* _type_key = "relay.OpImplementation"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementationNode, Object); }; /*! * \brief Operator implementation class. */ -class OpImplement : public ObjectRef { +class OpImplementation : public ObjectRef { public: /*! * \brief Invoke the operator compute function. @@ -84,7 +84,7 @@ class OpImplement : public ObjectRef { const Array& outs, const Target& target); - TVM_DEFINE_OBJECT_REF_METHODS(OpImplement, ObjectRef, OpImplementNode); + TVM_DEFINE_OBJECT_REF_METHODS(OpImplementation, ObjectRef, OpImplementationNode); }; /*! @@ -93,14 +93,14 @@ class OpImplement : public ObjectRef { class OpSpecializationNode : public Object { public: /*! \brief List of implementations. */ - Array implements; + Array implementations; /*! \brief Condition to enable the specialization. * Could be undefined to represent generic case. */ te::SpecializedCondition condition; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("condition", &condition); - v->Visit("implements", &implements); + v->Visit("implements", &implementations); } static constexpr const char* _type_key = "relay.OpSpecialization"; @@ -119,8 +119,8 @@ class OpSpecialization : public ObjectRef { * \param name Name of the implementation * \param plevel Priority level of the implementation */ - TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - std::string name, int plevel); + TVM_DLL void AddImplementation(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); }; @@ -153,8 +153,8 @@ class OpStrategy : public ObjectRef { * \param name Name of the implementation * \param plevel Priority level of the implementation */ - TVM_DLL void AddImplement(FTVMCompute fcompute, FTVMSchedule fschedule, - std::string name, int plevel); + TVM_DLL void AddImplementation(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode); }; diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 5cf3cf2e0b66..4fd88f4383df 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -144,8 +144,8 @@ class OpPattern(object): OPAQUE = 8 -@tvm._ffi.register_object("relay.OpImplement") -class OpImplement(Object): +@tvm._ffi.register_object("relay.OpImplementation") +class OpImplementation(Object): """Operator implementation""" def compute(self, attrs, inputs, out_type): """Call compute function. @@ -166,7 +166,7 @@ def compute(self, attrs, inputs, out_type): outs : list[tvm.tensor.Tensor] The output tensors. """ - return _OpImplementCompute(self, attrs, inputs, out_type) + return _OpImplementationCompute(self, attrs, inputs, out_type) def schedule(self, attrs, outs, target): """Call schedule function. @@ -187,7 +187,7 @@ def schedule(self, attrs, outs, target): schedule : tvm.Schedule The schedule. """ - return _OpImplementSchedule(self, attrs, outs, target) + return _OpImplementationSchedule(self, attrs, outs, target) @tvm._ffi.register_object("relay.OpSpecialization") @@ -201,7 +201,7 @@ class OpStrategy(Object): def __init__(self): self.__init_handle_by_constructor__(_make.OpStrategy) - def add_implement(self, compute, schedule, name="default", plevel=10): + def add_implementation(self, compute, schedule, name="default", plevel=10): """Add an implementation to the strategy Parameters @@ -219,13 +219,13 @@ def add_implement(self, compute, schedule, name="default", plevel=10): plevel : int The priority level of implementation. """ - _OpStrategyAddImplement(self, compute, schedule, name, plevel) + _OpStrategyAddImplementation(self, compute, schedule, name, plevel) def _wrap_default_fstrategy(compute, schedule, name): def _fstrategy(attrs, inputs, out_type, target): strategy = OpStrategy() - strategy.add_implement(compute, schedule, name=name) + strategy.add_implementation(compute, schedule, name=name) return strategy return _fstrategy diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 850001d9ede2..62cff53ff2c0 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -54,7 +54,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), name="conv2d_nchw_spatial_pack.arm_cpu") @@ -63,13 +63,13 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.arm_cpu", plevel=15) if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), name="conv2d_nchw_winograd_nnpack.arm_cpu", @@ -77,13 +77,13 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): elif layout == "HWCN": assert kernel_layout == "HWIO" logger.warning("conv2d_hwcn is not optimized for arm cpu.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic") elif layout == "NHWC": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), name="conv2d_nhwc_spatial_pack.arm_cpu") @@ -93,11 +93,11 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if layout == "NCHW": assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout) if kernel_layout == "OIHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.arm_cpu") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), name="depthwise_conv2d_nchw_spatial_pack.arm_cpu", @@ -105,7 +105,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): elif layout == "NHWC": assert kernel_layout == "HWOI" logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic") @@ -116,7 +116,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if layout == 'NCHW': assert kernel_layout == "OIHW" logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic") @@ -152,12 +152,12 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out _, _, kh, kw = get_const_tuple(inputs[1].shape) pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.arm_cpu") if pt == 1 and pb == 1 and pl == 1 and pr == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_winograd_nnpack( topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform), wrap_topi_schedule( @@ -179,7 +179,7 @@ def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw), name="conv2d_tranpose_nchw.arm_cpu") @@ -191,12 +191,12 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw), name="bitserial_conv2d_nchw.arm_cpu") elif layout == "NHWC": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.arm_cpu.bitserial_conv2d_nhwc), wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc), name="bitserial_conv2d_nhwc.arm_cpu") @@ -208,7 +208,7 @@ def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): """bitserial_dense arm cpu strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense), wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense), name="bitserial_dense.arm_cpu") diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py index 74c4b0bed530..cf60790f75e3 100644 --- a/python/tvm/relay/op/strategy/bifrost.py +++ b/python/tvm/relay/op/strategy/bifrost.py @@ -37,7 +37,7 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack), name="conv2d_nchw_spatial_pack.bifrost") @@ -45,7 +45,7 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): _, _, kh, kw = get_const_tuple(kernel.shape) if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.bifrost", @@ -56,7 +56,7 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.bifrost") @@ -80,7 +80,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out if layout == "NCHW": _, _, kh, kw = get_const_tuple(inputs[1].shape) assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.bifrost") @@ -93,7 +93,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out def dense_strategy_bifrost(attrs, inputs, out_type, target): """dense mali(bifrost) strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_dense(topi.bifrost.dense), - wrap_topi_schedule(topi.bifrost.schedule_dense), - name="dense.bifrost") + strategy.add_implementation(wrap_compute_dense(topi.bifrost.dense), + wrap_topi_schedule(topi.bifrost.schedule_dense), + name="dense.bifrost") return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index e65d2910ee3f..e229f7b79728 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -87,34 +87,34 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if layout == "NCHW": # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.cuda", plevel=15) elif layout == "HWCN": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), name="conv2d_hwcn.cuda") # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda # elif layout == "NHWC": # assert kernel_layout == "HWIO" - # strategy.add_implement( + # strategy.add_implementation( # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), # name="conv2d_nhwc.cuda") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda") @@ -124,7 +124,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if target.target_name == "cuda" and "cudnn" in target.libs: if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \ padding[1] == padding[3]: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", @@ -132,13 +132,13 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="dpethwise_nchw.cuda") elif layout == "NHWC": assert kernel_layout == "HWOI" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda") @@ -148,13 +148,13 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): if layout == 'NCHW': # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda") elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda") @@ -172,7 +172,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform), wrap_topi_schedule( topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform), @@ -188,7 +188,7 @@ def deformable_conv2d_strategy_cuda(attrs, inputs, out_type, target): layout = attrs.data_layout assert layout == "NCHW" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw), name="deformable_conv2d_nchw.cuda") @@ -204,7 +204,7 @@ def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw), name="conv2d_transpose_nchw.cuda") @@ -217,20 +217,20 @@ def conv3d_strategy_cuda(attrs, inputs, out_type, target): layout = attrs.data_layout assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": - strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), - wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), - name="conv3d_ncdhw.cuda", - plevel=10) + strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), + wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.cuda", + plevel=10) else: # layout == "NDHWC": - strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), - wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), - name="conv3d_ndhwc.cuda", - plevel=10) + strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), + wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.cuda", + plevel=10) if target.target_name == "cuda" and "cudnn" in target.libs: - strategy.add_implement(wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), - wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), - name="conv3d_cudnn.cuda", - plevel=15) + strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), + wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), + name="conv3d_cudnn.cuda", + plevel=15) return strategy @conv1d_strategy.register(["cuda", "gpu"]) @@ -242,13 +242,13 @@ def conv1d_strategy_cuda(attrs, inputs, out_type, target): raise ValueError("dilation should be a positive value") strategy = _op.OpStrategy() if layout == "NCW": - strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_ncw), - wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw), - name="conv1d_ncw.cuda") + strategy.add_implementation(wrap_compute_conv1d(topi.cuda.conv1d_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw), + name="conv1d_ncw.cuda") elif layout == "NWC": - strategy.add_implement(wrap_compute_conv1d(topi.cuda.conv1d_nwc), - wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc), - name="conv1d_nwc.cuda") + strategy.add_implementation(wrap_compute_conv1d(topi.cuda.conv1d_nwc), + wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc), + name="conv1d_nwc.cuda") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -263,9 +263,9 @@ def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): assert layout == "NCW", "conv1d_transpose ncw only supported" assert dilation == (1,), "conv1d_transpose dilation is not supported" assert groups == 1, "conv1d_transpose groups == 1 only supported" - strategy.add_implement(wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw), - wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw), - name="conv1d_transpose_ncw.cuda") + strategy.add_implementation(wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw), + name="conv1d_transpose_ncw.cuda") return strategy @dense_strategy.register(["cuda", "gpu"]) @@ -273,73 +273,81 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): """dense cuda strategy""" strategy = _op.OpStrategy() if out_type.dtype == "int8": - strategy.add_implement(wrap_compute_dense(topi.cuda.dense_int8), - wrap_topi_schedule(topi.cuda.schedule_dense_int8), - name="dense_int8.cuda") + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_int8), + wrap_topi_schedule(topi.cuda.schedule_dense_int8), + name="dense_int8.cuda") else: - strategy.add_implement(wrap_compute_dense(topi.cuda.dense_small_batch), - wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), - name="dense_small_batch.cuda") + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_small_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda") b = inputs[0].shape[0] with SpecializedCondition(b >= 32): - strategy.add_implement(wrap_compute_dense(topi.cuda.dense_large_batch), - wrap_topi_schedule(topi.cuda.schedule_dense_large_batch), - name="dense_large_batch.cuda", - plevel=15) + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_large_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_large_batch), + name="dense_large_batch.cuda", + plevel=15) if target.target_name == "cuda" and "cublas" in target.libs: - strategy.add_implement(wrap_compute_dense(topi.cuda.dense_cublas), - wrap_topi_schedule(topi.cuda.schedule_dense_cublas), - name="dense_cublas.cuda", - plevel=20) + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_cublas), + wrap_topi_schedule(topi.cuda.schedule_dense_cublas), + name="dense_cublas.cuda", + plevel=20) return strategy @batch_matmul_strategy.register(["cuda", "gpu"]) def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): """batch_matmul cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), - wrap_topi_schedule(topi.cuda.schedule_batch_matmul), - name="batch_matmul.cuda", - plevel=10) + strategy.add_implementation( + wrap_compute_batch_matmul(topi.nn.batch_matmul), + wrap_topi_schedule(topi.cuda.schedule_batch_matmul), + name="batch_matmul.cuda", + plevel=10) if target.target_name == "cuda" and "cublas" in target.libs: - strategy.add_implement(wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), - wrap_topi_schedule(topi.generic.schedule_extern), - name="batch_matmul_cublas.cuda", - plevel=15) + strategy.add_implementation( + wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), + wrap_topi_schedule(topi.generic.schedule_extern), + name="batch_matmul_cublas.cuda", + plevel=15) return strategy @argsort_strategy.register(["cuda", "gpu"]) def argsort_strategy_cuda(attrs, inputs, out_type, target): """argsort cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_argsort(topi.cuda.argsort), - wrap_topi_schedule(topi.cuda.schedule_argsort), - name="argsort.cuda") + strategy.add_implementation( + wrap_compute_argsort(topi.cuda.argsort), + wrap_topi_schedule(topi.cuda.schedule_argsort), + name="argsort.cuda") return strategy @topk_strategy.register(["cuda", "gpu"]) def topk_strategy_cuda(attrs, inputs, out_type, target): """topk cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_topk(topi.cuda.topk), - wrap_topi_schedule(topi.cuda.schedule_topk), - name="topk.cuda") + strategy.add_implementation(wrap_compute_topk(topi.cuda.topk), + wrap_topi_schedule(topi.cuda.schedule_topk), + name="topk.cuda") return strategy @multibox_prior_strategy.register(["cuda", "gpu"]) def multibox_prior_strategy_cuda(attrs, inputs, out_type, target): """multibox_prior cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_multibox_prior(topi.cuda.multibox_prior), - wrap_topi_schedule(topi.cuda.schedule_multibox_prior), - name="multibox_prior.cuda") + strategy.add_implementation( + wrap_compute_multibox_prior(topi.cuda.multibox_prior), + wrap_topi_schedule(topi.cuda.schedule_multibox_prior), + name="multibox_prior.cuda") return strategy @multibox_transform_loc_strategy.register(["cuda", "gpu"]) def multibox_transform_loc_strategy_cuda(attrs, inputs, out_type, target): """multibox_transform_loc cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_multibox_transform_loc(topi.cuda.multibox_transform_loc), wrap_topi_schedule(topi.cuda.schedule_multibox_transform_loc), name="multibox_transform_loc.cuda") @@ -349,27 +357,29 @@ def multibox_transform_loc_strategy_cuda(attrs, inputs, out_type, target): def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target): """get_valid_counts cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_get_valid_counts(topi.cuda.get_valid_counts), - wrap_topi_schedule(topi.cuda.schedule_get_valid_counts), - name="get_valid_counts.cuda") + strategy.add_implementation( + wrap_compute_get_valid_counts(topi.cuda.get_valid_counts), + wrap_topi_schedule(topi.cuda.schedule_get_valid_counts), + name="get_valid_counts.cuda") return strategy @nms_strategy.register(["cuda", "gpu"]) def nms_strategy_cuda(attrs, inputs, out_type, target): """nms cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_nms(topi.cuda.non_max_suppression), - wrap_topi_schedule(topi.cuda.schedule_nms), - name="nms.cuda") + strategy.add_implementation( + wrap_compute_nms(topi.cuda.non_max_suppression), + wrap_topi_schedule(topi.cuda.schedule_nms), + name="nms.cuda") return strategy @roi_align_strategy.register(["cuda", "gpu"]) def roi_align_strategy_cuda(attrs, inputs, out_type, target): """roi_align cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.cuda.schedule_roi_align), - name="roi_align_nchw.cuda") + strategy.add_implementation(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.cuda.schedule_roi_align), + name="roi_align_nchw.cuda") return strategy @schedule_roi_pool.register(["cuda", "gpu"]) @@ -382,7 +392,7 @@ def schedule_roi_pool_cuda(attrs, outs, target): def proposal_strategy_cuda(attrs, inputs, out_type, target): """proposal cuda strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_proposal(topi.cuda.proposal), - wrap_topi_schedule(topi.cuda.schedule_proposal), - name="proposal.cuda") + strategy.add_implementation(wrap_compute_proposal(topi.cuda.proposal), + wrap_topi_schedule(topi.cuda.schedule_proposal), + name="proposal.cuda") return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index b20a630e9296..312ce95b2510 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -168,19 +168,19 @@ def conv2d_strategy(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_conv2d_nchw), name="conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc), name="conv2d_nhwc.generic") elif layout == "HWCN": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic") @@ -189,13 +189,13 @@ def conv2d_strategy(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWOI" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic") @@ -204,7 +204,7 @@ def conv2d_strategy(attrs, inputs, out_type, target): else: # group_conv2d if layout == 'NCHW': assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic") @@ -219,12 +219,12 @@ def conv2d_NCHWc_strategy(attrs, inputs, out_type, target): logger.warning("conv2d_NCHWc is not optimized for this platform.") strategy = _op.OpStrategy() if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8": - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_NCHWc_int8, True, True), wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.generic") else: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc), name="conv2d_NCHWc.generic") @@ -236,7 +236,7 @@ def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target): """depthwise_conv2d generic strategy""" logger.warning("depthwise_conv2d_NCHWc is not optimized for this platform.") strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_NCHWc, True, True), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc), name="depthwise_conv2d_NCHWc.generic") @@ -286,7 +286,7 @@ def deformable_conv2d_strategy(attrs, inputs, out_type, target): layout = attrs.data_layout assert layout == "NCHW" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw), name="deformable_conv2d.generic") @@ -321,7 +321,7 @@ def conv2d_transpose_strategy(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), name="conv2d_transpose_nchw.generic") @@ -361,13 +361,15 @@ def conv3d_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCDHW": - strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), - name="conv3d_ncdhw.generic") + strategy.add_implementation( + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") elif layout == "NDHWC": - strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ndhwc), - wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc), - name="conv3d_ndhwc.generic") + strategy.add_implementation( + wrap_compute_conv3d(topi.nn.conv3d_ndhwc), + wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.generic") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -397,13 +399,15 @@ def conv1d_strategy(attrs, inputs, out_type, target): raise ValueError("dilation should be a positive value") strategy = _op.OpStrategy() if layout == "NCW": - strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), - wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), - name="conv1d_ncw.generic") + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), + name="conv1d_ncw.generic") elif layout == "NWC": - strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), - name="conv1d_nwc.generic") + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), + name="conv1d_nwc.generic") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -433,9 +437,9 @@ def conv1d_transpose_strategy(attrs, inputs, out_type, target): assert layout == "NCW", "conv1d_transpose ncw only supported" assert dilation == (1,), "conv1d_transpose dilation is not supported" assert groups == 1, "conv1d_transpose groups == 1 only supported" - strategy.add_implement(wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw), - wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw), - name="conv1d_transpose_ncw.generic") + strategy.add_implementation(wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw), + name="conv1d_transpose_ncw.generic") return strategy # dense @@ -453,9 +457,9 @@ def dense_strategy(attrs, inputs, out_type, target): """dense generic strategy""" logger.warning("dense is not optimized for this platform.") strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.generic.schedule_dense), - name="dense.generic") + strategy.add_implementation(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.generic.schedule_dense), + name="dense.generic") return strategy # batch_matmul @@ -470,9 +474,9 @@ def batch_matmul_strategy(attrs, inputs, out_type, target): """batch_matmul generic strategy""" logger.warning("batch_matmul is not optimized for this platform.") strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_batch_matmul(topi.nn.batch_matmul), - wrap_topi_schedule(topi.generic.schedule_batch_matmul), - name="batch_matmul.generic") + strategy.add_implementation(wrap_compute_batch_matmul(topi.nn.batch_matmul), + wrap_topi_schedule(topi.generic.schedule_batch_matmul), + name="batch_matmul.generic") return strategy # sparse_dense @@ -503,9 +507,9 @@ def _compute_argsort(attrs, inputs, _): def argsort_strategy(attrs, inputs, out_type, target): """argsort generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_argsort(topi.argsort), - wrap_topi_schedule(topi.generic.schedule_argsort), - name="argsort.generic") + strategy.add_implementation(wrap_compute_argsort(topi.argsort), + wrap_topi_schedule(topi.generic.schedule_argsort), + name="argsort.generic") return strategy # topk @@ -526,9 +530,9 @@ def _compute_topk(attrs, inputs, out_type): def topk_strategy(attrs, inputs, out_type, target): """topk generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_topk(topi.topk), - wrap_topi_schedule(topi.generic.schedule_topk), - name="topk.generic") + strategy.add_implementation(wrap_compute_topk(topi.topk), + wrap_topi_schedule(topi.generic.schedule_topk), + name="topk.generic") return strategy # multibox_prior @@ -548,9 +552,9 @@ def _compute_multibox_prior(attrs, inputs, _): def multibox_prior_strategy(attrs, inputs, out_type, target): """multibox_prior generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_multibox_prior(topi.vision.ssd.multibox_prior), - wrap_topi_schedule(topi.generic.schedule_multibox_prior), - name="multibox_prior.generic") + strategy.add_implementation(wrap_compute_multibox_prior(topi.vision.ssd.multibox_prior), + wrap_topi_schedule(topi.generic.schedule_multibox_prior), + name="multibox_prior.generic") return strategy # multibox_transform_loc @@ -569,7 +573,7 @@ def _compute_multibox_transform_loc(attrs, inputs, _): def multibox_transform_loc_strategy(attrs, inputs, out_type, target): """schedule multibox_transform_loc""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_multibox_transform_loc(topi.vision.ssd.multibox_transform_loc), wrap_topi_schedule(topi.generic.schedule_multibox_transform_loc), name="multibox_transform_loc.generic") @@ -589,9 +593,9 @@ def _compute_get_valid_counts(attrs, inputs, out_type): def get_valid_counts_strategy(attrs, inputs, out_type, target): """get_valid_counts generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_get_valid_counts(topi.vision.get_valid_counts), - wrap_topi_schedule(topi.generic.schedule_get_valid_counts), - name="get_valid_counts.generic") + strategy.add_implementation(wrap_compute_get_valid_counts(topi.vision.get_valid_counts), + wrap_topi_schedule(topi.generic.schedule_get_valid_counts), + name="get_valid_counts.generic") return strategy # non-maximum suppression @@ -616,9 +620,9 @@ def _compute_nms(attrs, inputs, out_type): def nms_strategy(attrs, inputs, out_type, target): """nms generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_nms(topi.vision.non_max_suppression), - wrap_topi_schedule(topi.generic.schedule_nms), - name="nms.generic") + strategy.add_implementation(wrap_compute_nms(topi.vision.non_max_suppression), + wrap_topi_schedule(topi.generic.schedule_nms), + name="nms.generic") return strategy # roi_align @@ -637,9 +641,9 @@ def _compute_roi_align(attrs, inputs, out_type): def roi_align_strategy(attrs, inputs, out_type, target): """roi_align generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align), - name="roi_align.generic") + strategy.add_implementation(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.generic") return strategy # roi_pool @@ -670,9 +674,9 @@ def _compute_proposal(attrs, inputs, out_type): def proposal_strategy(attrs, inputs, out_type, target): """proposal generic strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_proposal(topi.vision.rcnn.proposal), - wrap_topi_schedule(topi.generic.schedule_proposal), - name="proposal.generic") + strategy.add_implementation(wrap_compute_proposal(topi.vision.rcnn.proposal), + wrap_topi_schedule(topi.generic.schedule_proposal), + name="proposal.generic") return strategy # argwhere @@ -705,12 +709,12 @@ def bitserial_conv2d_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw), name="bitserial_conv2d_nchw.generic") elif layout == "NHWC": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc), name="bitserial_conv2d_nhwc.generic") @@ -738,7 +742,7 @@ def bitserial_dense_strategy(attrs, inputs, out_type, target): """bitserial_dense generic strategy""" logger.warning("bitserial_dense is not optimized for this platform.") strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_dense(topi.nn.bitserial_dense), wrap_topi_schedule(topi.generic.schedule_bitserial_dense), name="bitserial_dense.generic") diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py index ca14ffe92d61..514902b86833 100644 --- a/python/tvm/relay/op/strategy/hls.py +++ b/python/tvm/relay/op/strategy/hls.py @@ -72,13 +72,13 @@ def conv2d_strategy_hls(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nchw), wrap_topi_schedule(topi.hls.schedule_conv2d_nchw), name="conv2d_nchw.hls") elif layout == "NHWC": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc), wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc), name="conv2d_nhwc.hls") @@ -87,13 +87,13 @@ def conv2d_strategy_hls(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.hls") elif layout == "NHWC": assert kernel_layout == "HWOI" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc), name="depthwise_nhwc.hls") @@ -107,7 +107,7 @@ def conv2d_strategy_hls(attrs, inputs, out_type, target): def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target): """conv2d_NCHWc hls strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc), name="conv2d_NCHWc.hls") @@ -123,7 +123,7 @@ def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw), name="conv2d_transpose_nchw.hls") @@ -133,9 +133,9 @@ def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): def dense_strategy_hls(attrs, inputs, out_type, target): """dense hls strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.hls.schedule_dense), - name="dense.hls") + strategy.add_implementation(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.hls.schedule_dense), + name="dense.hls") return strategy @bitserial_conv2d_strategy.register("hls") @@ -144,12 +144,12 @@ def bitserial_conv2d_strategy_hls(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw), name="bitserial_conv2d_nchw.hls") elif layout == "NHWC": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc), name="bitserial_conv2d_nhwc.hls") diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py index cd047f79305e..0ea8d85e3530 100644 --- a/python/tvm/relay/op/strategy/intel_graphics.py +++ b/python/tvm/relay/op/strategy/intel_graphics.py @@ -36,13 +36,13 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.intel_graphics.conv2d_nchw), wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw), name="conv2d_nchw.intel_graphics") # conv2d_NCHWc won't work without alter op layout pass # TODO(@Laurawly): fix this - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), name="conv2d_NCHWc.intel_graphics", @@ -53,7 +53,7 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.intel_graphics.depthwise_conv2d_nchw), wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.intel_graphics") @@ -67,7 +67,7 @@ def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target): """conv2d_NCHWc intel_graphics strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), name="conv2d_NCHWc.intel_graphics") diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index 885ad24c3657..38ea231d681e 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -36,7 +36,7 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), name="conv2d_nchw_spatial_pack.mali") @@ -44,7 +44,7 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): _, _, kh, kw = get_const_tuple(kernel.shape) if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ dilation_h == 1 and dilation_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali", @@ -54,7 +54,7 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw), wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.mali") @@ -77,7 +77,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty if layout == "NCHW": _, _, kh, kw = get_const_tuple(inputs[1].shape) assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali") @@ -90,7 +90,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty def dense_strategy_mali(attrs, inputs, out_type, target): """dense mali strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_dense(topi.mali.dense), - wrap_topi_schedule(topi.mali.schedule_dense), - name="dense.mali") + strategy.add_implementation(wrap_compute_dense(topi.mali.dense), + wrap_topi_schedule(topi.mali.schedule_dense), + name="dense.mali") return strategy diff --git a/python/tvm/relay/op/strategy/opengl.py b/python/tvm/relay/op/strategy/opengl.py index c21ccc5593e6..45e290c50e0f 100644 --- a/python/tvm/relay/op/strategy/opengl.py +++ b/python/tvm/relay/op/strategy/opengl.py @@ -58,16 +58,16 @@ def conv2d_strategy_opengl(attrs, inputs, out_type, target): layout = attrs.data_layout assert groups == 1, "Don't support group conv2d on OpenGL" assert layout == "NCHW", "Only support conv2d layout NCHW for OpenGL" - strategy.add_implement(wrap_compute_conv2d(topi.nn.conv2d), - wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw), - name="conv2d_nchw.opengl") + strategy.add_implementation(wrap_compute_conv2d(topi.nn.conv2d), + wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw), + name="conv2d_nchw.opengl") return strategy @dense_strategy.register("opengl") def dense_strategy_opengl(attrs, inputs, out_type, target): """dense opengl strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(topi.opengl.schedule_dense), - name="dense.opengl") + strategy.add_implementation(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.opengl.schedule_dense), + name="dense.opengl") return strategy diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index 86921e1fdb15..e11a688c1398 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -43,33 +43,33 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): if layout == "NCHW": # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda") _, _, kh, kw = get_const_tuple(kernel.shape) if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.cuda", plevel=15) elif layout == "HWCN": assert kernel_layout == "HWIO" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), name="conv2d_hwcn.cuda") # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda # elif layout == "NHWC": # assert kernel_layout == "HWIO" - # strategy.add_implement( + # strategy.add_implementation( # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), # name="conv2d_nhwc.cuda") elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda") @@ -78,7 +78,7 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): # add miopen implementation if "miopen" in target.libs: if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), name="conv2d_nchw_miopen.rocm", @@ -86,13 +86,13 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda") elif layout == "NHWC": assert kernel_layout == "HWOI" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda") @@ -102,13 +102,13 @@ def conv2d_strategy_rocm(attrs, inputs, out_type, target): if layout == 'NCHW': # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda") elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda") @@ -122,12 +122,13 @@ def dense_strategy_rocm(attrs, inputs, out_type, target): strategy = _op.OpStrategy() assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense" - strategy.add_implement(wrap_compute_dense(topi.rocm.dense), - wrap_topi_schedule(topi.rocm.schedule_dense), - name="dense.rocm") + strategy.add_implementation( + wrap_compute_dense(topi.rocm.dense), + wrap_topi_schedule(topi.rocm.schedule_dense), + name="dense.rocm") if target.target_name == "rocm" and "rocblas" in target.libs: assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported." - strategy.add_implement( + strategy.add_implementation( wrap_compute_dense(topi.rocm.dense_rocblas), wrap_topi_schedule(topi.rocm.dense_rocblas), name="dense_rocblas.rocm", diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 86576ffcd7fc..9442b7c9cb5d 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -77,26 +77,26 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): if layout == "NCHW": assert kernel_layout == "OIHW" if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.x86") else: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), name="conv2d_nchw.x86") elif layout == "NHWC": assert kernel_layout == "HWIO" logger.warning("For x86 target, NCHW layout is recommended for conv2d.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc), wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), name="conv2d_nhwc.x86") elif layout == "HWCN": assert kernel_layout == "HWIO" logger.warning("conv2d HWCN layout is not optimized for x86.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic") @@ -107,21 +107,21 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "OIHW" channel_multiplier = get_const_tuple(inputs[1].shape)[1] if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.x86") else: logger.warning("For x86 target, depthwise_conv2d with channel " "multiplier greater than 1 is not optimized") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.generic") elif layout == "NHWC": assert kernel_layout == "HWOI" logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic") @@ -131,7 +131,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): if layout == 'NCHW': assert kernel_layout == "OIHW" logger.warning("group_conv2d is not optimized for x86.") - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic") @@ -145,12 +145,12 @@ def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() data, kernel = inputs if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_NCHWc_int8, True, True), wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.x86") else: - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True), wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc), name="conv2d_NCHWc.x86") @@ -160,7 +160,7 @@ def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): """depthwise_conv2d x86 strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True), wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc), name="depthwise_conv2d_NCHWc.x86") @@ -176,7 +176,7 @@ def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), name="conv2d_transpose_nchw.x86") @@ -189,13 +189,13 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): layout = attrs.data_layout if layout == "NCDHW": logger.warning("conv3d with layout NCDHW is not optimized for x86.") - strategy.add_implement(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), - wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), - name="conv3d_ncdhw.generic") + strategy.add_implementation(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") elif layout == "NDHWC": - strategy.add_implement(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), - wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), - name="conv3d_ndhwc.x86") + strategy.add_implementation(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.x86") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -209,13 +209,13 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): raise ValueError("dilation should be a positive value") strategy = _op.OpStrategy() if layout == "NCW": - strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_ncw), - wrap_topi_schedule(topi.x86.schedule_conv1d_ncw), - name="conv1d_ncw.x86") + strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.x86.schedule_conv1d_ncw), + name="conv1d_ncw.x86") elif layout == "NWC": - strategy.add_implement(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), - name="conv1d_nwc.x86") + strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), + name="conv1d_nwc.x86") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -225,36 +225,36 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() m, _ = inputs[0].shape - strategy.add_implement(wrap_compute_dense(topi.x86.dense_nopack), - wrap_topi_schedule(topi.x86.schedule_dense_nopack), - name="dense_nopack.x86", - plevel=10) + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack), + wrap_topi_schedule(topi.x86.schedule_dense_nopack), + name="dense_nopack.x86", + plevel=10) if "cblas" in target.libs: - strategy.add_implement(wrap_compute_dense(topi.x86.dense_cblas), - wrap_topi_schedule(topi.x86.schedule_dense_cblas), - name="dense_cblas.x86", - plevel=5) + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas), + wrap_topi_schedule(topi.x86.schedule_dense_cblas), + name="dense_cblas.x86", + plevel=5) with SpecializedCondition(m >= 16): # this implementation may not be well-optimized, so use plevel=8 for now. - strategy.add_implement(wrap_compute_dense(topi.x86.dense_pack), - wrap_topi_schedule(topi.x86.schedule_dense_pack), - name="dense_pack.x86", - plevel=8) + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack), + wrap_topi_schedule(topi.x86.schedule_dense_pack), + name="dense_pack.x86", + plevel=8) return strategy @batch_matmul_strategy.register("cpu") def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul), - wrap_topi_schedule(topi.x86.schedule_batch_matmul), - name="batch_matmul.x86", - plevel=10) + strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul), + wrap_topi_schedule(topi.x86.schedule_batch_matmul), + name="batch_matmul.x86", + plevel=10) if "cblas" in target.libs: - strategy.add_implement(wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), - wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), - name="batch_matmul_cblas.x86", - plevel=5) + strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), + wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), + name="batch_matmul_cblas.x86", + plevel=5) return strategy @schedule_sparse_dense.register("cpu") @@ -267,9 +267,9 @@ def schedule_sparse_dense_cpu(attrs, outs, target): def roi_align_strategy_cpu(attrs, inputs, out_type, target): """roi_align x86 strategy""" strategy = _op.OpStrategy() - strategy.add_implement(wrap_compute_roi_align(topi.x86.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align), - name="roi_align.x86") + strategy.add_implementation(wrap_compute_roi_align(topi.x86.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.x86") return strategy @bitserial_conv2d_strategy.register("cpu") @@ -278,12 +278,12 @@ def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() layout = attrs.data_layout if layout == "NCHW": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw), name="bitserial_conv2d_nchw.x86") elif layout == "NHWC": - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nhwc), wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc), name="bitserial_conv2d_nhwc.x86") @@ -295,7 +295,7 @@ def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target): def bitserial_dense_strategy_cpu(attrs, inputs, out_type, target): """bitserial_dense x86 strategy""" strategy = _op.OpStrategy() - strategy.add_implement( + strategy.add_implementation( wrap_compute_bitserial_dense(topi.x86.bitserial_dense), wrap_topi_schedule(topi.x86.schedule_bitserial_dense), name="bitserial_dense.x86") diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 843037d62796..d0a7da9f1ba9 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -53,10 +53,10 @@ TVM_REGISTER_NODE_TYPE(CCacheKeyNode); TVM_REGISTER_NODE_TYPE(CCacheValueNode); TVM_REGISTER_OBJECT_TYPE(CompileEngineNode); -LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplement implement) { +LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplementation impl) { auto n = make_object(); n->outputs = std::move(outputs); - n->implement = std::move(implement); + n->implementation = std::move(impl); data_ = std::move(n); } @@ -166,8 +166,8 @@ class ScheduleGetter : te::Schedule schedule; // No need to register schedule for device copy op. if (master_attrs_.as() == nullptr) { - CHECK(master_implement_.defined()); - schedule = master_implement_.Schedule(master_attrs_, tensor_outs, target_); + CHECK(master_implementation_.defined()); + schedule = master_implementation_.Schedule(master_attrs_, tensor_outs, target_); for (const auto& scalar : scalars_) { if (schedule->Contain(scalar)) { schedule[scalar].compute_inline(); @@ -245,7 +245,7 @@ class ScheduleGetter : Op op = Downcast(call_node->op); Array outputs; - OpImplement implement; + OpImplementation impl; // Skip fcompute for device copy operators as it is not registered. if (op == device_copy_op_) { const auto* copy_input = inputs[0].operator->(); @@ -254,7 +254,7 @@ class ScheduleGetter : } else { LoweredOutput lowered_out = (*flower_call)(GetRef(call_node), inputs, target_); outputs = lowered_out->outputs; - implement = lowered_out->implement; + impl = lowered_out->implementation; } int op_pattern = fpattern[op]; @@ -267,7 +267,7 @@ class ScheduleGetter : master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; - master_implement_ = implement; + master_implementation_ = impl; } if (outputs.size() != 1) { const auto* tuple_type = @@ -324,7 +324,7 @@ class ScheduleGetter : Op master_op_; Attrs master_attrs_; int master_op_pattern_{0}; - OpImplement master_implement_; + OpImplementation master_implementation_; std::ostringstream readable_name_stream_; std::unordered_map, ObjectHash, ObjectEqual> memo_; Array scalars_; @@ -814,8 +814,8 @@ const CompileEngine& CompileEngine::Global() { } TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") -.set_body_typed([](tvm::Array outputs, OpImplement implement) { - return LoweredOutput(outputs, implement); +.set_body_typed([](tvm::Array outputs, OpImplementation impl) { + return LoweredOutput(outputs, impl); }); TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 8cb2f1574894..538c348e21dc 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -1,4 +1,4 @@ -/* +src/relay/backend/compile_engine.cc/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -49,11 +49,11 @@ struct LoweredOutputNode : public Object { /*! \brief The outputs to the function */ tvm::Array outputs; /*! \brief The implementation used to compute the output */ - OpImplement implement; + OpImplementation implementation; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("outputs", &outputs); - v->Visit("implement", &implement); + v->Visit("implementation", &implementation); } static constexpr const char* _type_key = "relay.LoweredOutput"; @@ -62,7 +62,7 @@ struct LoweredOutputNode : public Object { class LoweredOutput : public ObjectRef { public: - TVM_DLL LoweredOutput(tvm::Array outputs, OpImplement implement); + TVM_DLL LoweredOutput(tvm::Array outputs, OpImplementation impl); TVM_DEFINE_OBJECT_REF_METHODS(LoweredOutput, ObjectRef, LoweredOutputNode); }; diff --git a/src/relay/ir/op_strategy.cc b/src/relay/ir/op_strategy.cc index 72886ceba9bf..ce0f107cd6f6 100644 --- a/src/relay/ir/op_strategy.cc +++ b/src/relay/ir/op_strategy.cc @@ -27,67 +27,67 @@ namespace tvm { namespace relay { -TVM_REGISTER_NODE_TYPE(OpImplementNode); +TVM_REGISTER_NODE_TYPE(OpImplementationNode); TVM_REGISTER_NODE_TYPE(OpSpecializationNode); TVM_REGISTER_NODE_TYPE(OpStrategyNode); -Array OpImplement::Compute(const Attrs& attrs, - const Array& inputs, - const Type& out_type) { +Array OpImplementation::Compute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { return (*this)->fcompute(attrs, inputs, out_type); } -te::Schedule OpImplement::Schedule(const Attrs& attrs, - const Array &outs, - const Target& target) { +te::Schedule OpImplementation::Schedule(const Attrs& attrs, + const Array &outs, + const Target& target) { return (*this)->fschedule(attrs, outs, target); } -void OpSpecialization::AddImplement(tvm::relay::FTVMCompute fcompute, - tvm::relay::FTVMSchedule fschedule, - std::string name, - int plevel) { - auto n = make_object(); +void OpSpecialization::AddImplementation(tvm::relay::FTVMCompute fcompute, + tvm::relay::FTVMSchedule fschedule, + std::string name, + int plevel){ + auto n = make_object(); n->fcompute = fcompute; n->fschedule = fschedule; n->name = std::move(name); n->plevel = plevel; - (*this)->implements.push_back(OpImplement(n)); + (*this)->implementations.push_back(OpImplementation(n)); } -void OpStrategy::AddImplement(FTVMCompute fcompute, - FTVMSchedule fschedule, - std::string name, - int plevel) { +void OpStrategy::AddImplementation(FTVMCompute fcompute, + FTVMSchedule fschedule, + std::string name, + int plevel) { auto curr_cond = te::SpecializedCondition::Current(); auto self = this->operator->(); Array specializations = self->specializations; OpSpecialization op_spec; for (OpSpecialization op_spec : specializations) { if (op_spec->condition == curr_cond) { - op_spec.AddImplement(fcompute, fschedule, std::move(name), plevel); + op_spec.AddImplementation(fcompute, fschedule, std::move(name), plevel); return; } } ObjectPtr n = make_object(); n->condition = curr_cond; op_spec = OpSpecialization(n); - op_spec.AddImplement(fcompute, fschedule, std::move(name), plevel); + op_spec.AddImplementation(fcompute, fschedule, std::move(name), plevel); self->specializations.push_back(op_spec); } -TVM_REGISTER_GLOBAL("relay.op._OpImplementCompute") +TVM_REGISTER_GLOBAL("relay.op._OpImplementationCompute") .set_body([](TVMArgs args, TVMRetValue* rv) { - OpImplement imp = args[0]; + OpImplementation imp = args[0]; Attrs attrs = args[1]; Array inputs = args[2]; Type out_type = args[3]; *rv = imp.Compute(attrs, inputs, out_type); }); -TVM_REGISTER_GLOBAL("relay.op._OpImplementSchedule") +TVM_REGISTER_GLOBAL("relay.op._OpImplementationSchedule") .set_body([](TVMArgs args, TVMRetValue* rv) { - OpImplement imp = args[0]; + OpImplementation imp = args[0]; Attrs attrs = args[1]; Array outs = args[2]; Target target = args[3]; @@ -100,14 +100,14 @@ TVM_REGISTER_GLOBAL("relay.op._make.OpStrategy") *rv = OpStrategy(n); }); -TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplement") +TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplementation") .set_body([](TVMArgs args, TVMRetValue* rv) { OpStrategy strategy = args[0]; FTVMCompute compute = args[1]; FTVMSchedule schedule = args[2]; std::string name = args[3]; int plevel = args[4]; - strategy.AddImplement(compute, schedule, name, plevel); + strategy.AddImplementation(compute, schedule, name, plevel); }); } // namespace relay diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 8c37c5a5437a..b9a8f8f96f8b 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -53,7 +53,7 @@ TVM_REGISTER_GLOBAL("test.strategy") auto n = make_object(); auto strategy = tvm::relay::OpStrategy(std::move(n)); - strategy.AddImplement(fcompute, fschedule, "test.strategy", 10); + strategy.AddImplementation(fcompute, fschedule, "test.strategy", 10); return strategy; }); @@ -64,7 +64,7 @@ TVM_REGISTER_GLOBAL("relay.backend.lower_call") Op op = Downcast(call->op); auto out_type = call->checked_type(); OpStrategy strategy = fstrategy[op](call->attrs, inputs, out_type, target); - auto impl = strategy->specializations[0]->implements[0]; + auto impl = strategy->specializations[0]->implementations[0]; auto outs = impl.Compute(call->attrs, inputs, out_type); auto f = tvm::runtime::Registry::Get("relay.backend._make_LoweredOutput"); if (!f) { diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 4da6e1916c92..04e14b1e2bdd 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -53,7 +53,7 @@ def compute_clip_vta(attrs, inputs, output_type): def clip_strategy_vta(attrs, inputs, out_type, target): strategy = OpStrategy() - strategy.add_implement( + strategy.add_implementation( compute_clip_vta, _strategy.wrap_topi_schedule(topi.generic.schedule_injective), name="clip.vta") @@ -78,12 +78,12 @@ def conv2d_strategy_vta(attrs, inputs, out_type, target): assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" assert kernel.dtype == "int8" - strategy.add_implement( + strategy.add_implementation( _strategy.wrap_compute_conv2d(conv2d_packed, True), _strategy.wrap_topi_schedule(schedule_conv2d_packed), name="conv2d_packed.vta") else: # group_conv2d - strategy.add_implement( + strategy.add_implementation( _strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True), _strategy.wrap_topi_schedule(schedule_group_conv2d_packed), name="group_conv2d_packed.vta") @@ -103,7 +103,7 @@ def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target): if is_packed_layout(layout): strategy = OpStrategy() - strategy.add_implement( + strategy.add_implementation( _strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed), _strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed), name="conv2d_transpose_packed.vta") @@ -119,7 +119,7 @@ def dense_strategy_vta(attrs, inputs, out_type, target): """dense vta strategy""" if inputs[0].shape == 4: # this implies the layout is packed strategy = OpStrategy() - strategy.add_implement( + strategy.add_implementation( _strategy.wrap_compute_dense(dense_packed), _strategy.wrap_topi_schedule(schedule_dense_packed), name="dense_packed.vta") From ea3dfaafe23af37e2cd2321877f86000234c6dc6 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 17 Feb 2020 14:35:18 -0800 Subject: [PATCH 38/48] address comments --- python/tvm/autotvm/database.py | 4 +++- python/tvm/autotvm/record.py | 12 ++++++------ python/tvm/autotvm/task/relay_integration.py | 8 ++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/tvm/autotvm/database.py b/python/tvm/autotvm/database.py index 75e3f9ff7d06..963f7e54ecaf 100644 --- a/python/tvm/autotvm/database.py +++ b/python/tvm/autotvm/database.py @@ -167,10 +167,12 @@ def filter(self, func): current = self.get(key) try: records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)] - records = list(filter(None, records)) + records = [rec for rec in records if rec is not None] except TypeError: # got a badly formatted/old format record continue + if not records: + continue inps, results = zip(*records) inp = inps[0] if not func(inp, results): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 90857a135933..416b2cd57eb6 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -124,15 +124,16 @@ def decode(row, protocol='json'): Parameters ---------- - row: str + row : str a row in the logger file - protocol: str + + protocol : str log protocol, json or pickle Returns ------- - input: autotvm.tuner.MeasureInput - result: autotvm.tuner.MeasureResult + ret : tuple(autotvm.tuner.MeasureInput, autotvm.tuner.MeasureResult), or None + The tuple of input and result, or None if input uses old version log format. """ # pylint: disable=unused-variable global _old_version_warning @@ -229,8 +230,7 @@ def split_workload(in_file, clean=True): logger.info("start converting...") pool = multiprocessing.Pool() - lines = pool.map(decode, lines) - lines = list(filter(None, lines)) + lines = [rec for rec in pool.map(decode, lines) if rec is not None] logger.info("map done %.2f", time.time() - tic) wkl_dict = OrderedDict() diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 8a45e3d1240d..cd8d32fb2d68 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -69,8 +69,8 @@ def extract_from_program(mod, params, target, target_host=None, ops=None): The compilation target target_host: tvm.target.Target The host compilation target - ops: List of relay.op.Op - List of relay ops to be tuned + ops: List[relay.op.Op] or None + List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- @@ -96,8 +96,8 @@ def extract_from_multiple_program(mods, params, target, target_host=None, ops=No The compilation target target_host: tvm.target.Target The host compilation target - ops: List of relay.op.Op - List of relay ops to be tuned + ops: List[relay.op.Op] or None + List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- From eb630ef09d44f1c34a054c6eefbc5f4ecd598a8e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 17 Feb 2020 14:42:48 -0800 Subject: [PATCH 39/48] lint --- src/relay/backend/compile_engine.h | 2 +- src/relay/ir/op_strategy.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 538c348e21dc..2dbacf645482 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -1,4 +1,4 @@ -src/relay/backend/compile_engine.cc/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information diff --git a/src/relay/ir/op_strategy.cc b/src/relay/ir/op_strategy.cc index ce0f107cd6f6..5ce609104395 100644 --- a/src/relay/ir/op_strategy.cc +++ b/src/relay/ir/op_strategy.cc @@ -46,7 +46,7 @@ te::Schedule OpImplementation::Schedule(const Attrs& attrs, void OpSpecialization::AddImplementation(tvm::relay::FTVMCompute fcompute, tvm::relay::FTVMSchedule fschedule, std::string name, - int plevel){ + int plevel) { auto n = make_object(); n->fcompute = fcompute; n->fschedule = fschedule; From 6cf45b5f58e359fb8f25b3e4c7de3cd5aa142ba4 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 17 Feb 2020 21:00:52 -0800 Subject: [PATCH 40/48] fix rebase err --- python/tvm/autotvm/task/topi_integration.py | 101 +------------------- python/tvm/relay/op/strategy/cuda.py | 2 +- python/tvm/relay/op/strategy/x86.py | 2 +- python/tvm/te/schedule.py | 8 +- src/te/schedule/schedule_lang.cc | 10 +- 5 files changed, 14 insertions(+), 109 deletions(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index f815b008e388..45385fbe8f7e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -26,56 +26,12 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ -<<<<<<< HEAD import tvm.te._ffi_api - -from ... import tensor, placeholder - -from .task import args_to_workload, dispatcher, register -from ..util import get_const_tuple - -# A table that records all registered dispatcher for all targets -_REGISTERED_DISPATCHER = { -} - - -def serialize_args(args): - """serialize arguments of a topi function to a hashable tuple. - - Parameters - ---------- - args: list of hashable or Tensor - """ - ret = [] - for t in args: - if isinstance(t, tensor.Tensor): - ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype)) - else: - ret.append(t) - return tuple(ret) - - -def deserialize_args(args): - """The inverse function of :code:`serialize_args`. - - Parameters - ---------- - args: list of hashable or Tensor - """ - ret = [] - for t in args: - if isinstance(t, tuple) and t[0] == 'TENSOR': - ret.append(placeholder(shape=t[1], dtype=t[2])) - else: - ret.append(t) - return ret -======= from tvm import target as _target -from ... import _api_internal, tensor +from ... import tensor from .task import args_to_workload, DispatchContext, \ register_task_compute, register_task_schedule, serialize_args ->>>>>>> relay op strategy # Task extractor for relay program @@ -185,56 +141,6 @@ def register_topi_compute(task_name, func=None): -------- See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ -<<<<<<< HEAD - def _decorator(f): - targets = [target_keys] if isinstance(target_keys, str) else target_keys - for target_key in targets: - if target_key not in _REGISTERED_DISPATCHER: - _REGISTERED_DISPATCHER[target_key] = {} - if topi_compute not in _REGISTERED_DISPATCHER[target_key]: - @topi_compute.register(target_key) - @dispatcher - def config_dispatcher(*args, **kwargs): - """override topi call as a config dispatcher""" - assert not kwargs, "Do not support kwargs in template function call" - return args_to_workload(args, topi_compute) - _REGISTERED_DISPATCHER[target_key][topi_compute] = config_dispatcher - - config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_compute] - - @config_dispatcher.register(template_keys, override=override) - def template_call(cfg, *args, **kwargs): - """call the topi func and attach workload to compute node""" - assert not kwargs, "Do not support kwargs in template function call" - - if f == topi_compute.fdefault: - node = f(*args, **kwargs) - else: - node = f(cfg, *args, **kwargs) - - # attach workload to return op - op = node.op - attrs = {} - for k, v in node.op.attrs.items(): - attrs[k] = v - attrs['workload'] = args_to_workload(args, topi_compute) - if isinstance(op, tensor.ComputeOp): - op = tvm.te._ffi_api.ComputeOp( - op.name, op.tag, attrs, op.axis, op.body) - elif isinstance(op, tensor.ExternOp): - op = tvm.te._ffi_api.ExternOp( - op.name, op.tag, attrs, - op.inputs, op.input_placeholders, - op.output_placeholders, op.body) - else: - raise RuntimeError("Unsupported op type: " + str(type(op))) - - if isinstance(node, tensor.Tensor): - return op.output(0) - return [op.output(i) for i in range(len(node))] - - return f -======= def _decorate(topi_compute): @register_task_compute(task_name) def wrapper(*args, **kwargs): @@ -255,16 +161,15 @@ def wrapper(*args, **kwargs): attrs[k] = v attrs['workload'] = workload if isinstance(op, tensor.ComputeOp): - op = _api_internal._ComputeOp( + op = tvm.te._ffi_api.ComputeOp( op.name, op.tag, attrs, op.axis, op.body) elif isinstance(op, tensor.ExternOp): - op = _api_internal._ExternOp( + op = tvm.te._ffi_api.ExternOp( op.name, op.tag, attrs, op.inputs, op.input_placeholders, op.output_placeholders, op.body) else: raise RuntimeError("Unsupported op type: " + str(type(op))) ->>>>>>> relay op strategy if isinstance(node, tensor.Tensor): return op.output(0) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index e229f7b79728..b2f559f12131 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -17,9 +17,9 @@ """Definition of CUDA/GPU operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import import topi +from tvm.te import SpecializedCondition from .generic import * from .. import op as _op -from ....schedule import SpecializedCondition @schedule_injective.register(["cuda", "gpu"]) def schedule_injective_cuda(attrs, outs, target): diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 9442b7c9cb5d..2fadb7f08dcd 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -19,9 +19,9 @@ import logging import topi +from tvm.te import SpecializedCondition from .generic import * from .. import op as _op -from ....schedule import SpecializedCondition logger = logging.getLogger('strategy') diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py index 6499cb57d5c9..f8bbe09725f2 100644 --- a/python/tvm/te/schedule.py +++ b/python/tvm/te/schedule.py @@ -537,19 +537,19 @@ def __init__(self, conditions): if not isinstance(conditions, (list, _container.Array)): conditions = [conditions] self.__init_handle_by_constructor__( - _ffi_api._CreateSpecializedCondition, conditions) + _ffi_api.CreateSpecializedCondition, conditions) @staticmethod def current(): """Returns the current specialized condition""" - return _ffi_api._GetCurrentSpecialization() + return _ffi_api.GetCurrentSpecialization() def __enter__(self): - _ffi_api._EnterSpecializationScope(self) + _ffi_api.EnterSpecializationScope(self) return self def __exit__(self, ptype, value, trace): - _ffi_api._ExitSpecializationScope(self) + _ffi_api.ExitSpecializationScope(self) tvm._ffi._init_api("schedule", __name__) diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 0417a41e74d2..2e26ea8682a1 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -825,12 +825,12 @@ SpecializedCondition SpecializedCondition::Current() { return cond; } -TVM_REGISTER_GLOBAL("_CreateSpecializedCondition") +TVM_REGISTER_GLOBAL("te.CreateSpecializedCondition") .set_body_typed([](Array condition) { return SpecializedCondition(condition); }); -TVM_REGISTER_GLOBAL("_GetCurrentSpecialization") +TVM_REGISTER_GLOBAL("te.GetCurrentSpecialization") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = SpecializedCondition::Current(); }); @@ -846,10 +846,10 @@ class SpecializedCondition::Internal { } }; -TVM_REGISTER_GLOBAL("_EnterSpecializationScope") +TVM_REGISTER_GLOBAL("te.EnterSpecializationScope") .set_body_typed(SpecializedCondition::Internal::EnterScope); -TVM_REGISTER_GLOBAL("_ExitSpecializationScope") +TVM_REGISTER_GLOBAL("te.ExitSpecializationScope") .set_body_typed(SpecializedCondition::Internal::ExitScope); TVM_REGISTER_NODE_TYPE(StageNode); @@ -917,7 +917,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); - p->stream << "specialization("; + p->stream << "specialized_condition("; p->Print(op->clauses); p->stream << ')'; }); From 8b0081ac47023081827186efca0fd9433f9393c2 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 18 Feb 2020 17:22:07 -0800 Subject: [PATCH 41/48] updates --- include/tvm/relay/op_strategy.h | 2 +- python/tvm/autotvm/task/task.py | 1 + python/tvm/relay/backend/compile_engine.py | 22 +-- python/tvm/relay/expr.py | 3 - python/tvm/relay/op/__init__.py | 2 +- src/relay/ir/expr.cc | 6 - .../relay/test_backend_compile_engine.py | 129 +++++++++++++++++- topi/python/topi/arm_cpu/conv2d_alter_op.py | 2 +- topi/python/topi/bifrost/conv2d.py | 2 +- topi/python/topi/cuda/conv2d_alter_op.py | 2 +- .../topi/intel_graphics/conv2d_alter_op.py | 2 +- topi/python/topi/mali/conv2d.py | 2 +- topi/python/topi/x86/conv2d_alter_op.py | 2 +- 13 files changed, 148 insertions(+), 29 deletions(-) diff --git a/include/tvm/relay/op_strategy.h b/include/tvm/relay/op_strategy.h index 3824f9fae6ad..a4da95a36b07 100644 --- a/include/tvm/relay/op_strategy.h +++ b/include/tvm/relay/op_strategy.h @@ -100,7 +100,7 @@ class OpSpecializationNode : public Object { void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("condition", &condition); - v->Visit("implements", &implementations); + v->Visit("implementations", &implementations); } static constexpr const char* _type_key = "relay.OpSpecialization"; diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index d09c540dcd21..ca1ae0eefefd 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -359,6 +359,7 @@ def create(task_name, args, target, target_host=None): tsk: Task a task object """ + args = serialize_args(args) ret = Task(task_name, args) if isinstance(target, str): diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 4034666ac8bb..407ac8231481 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -90,7 +90,7 @@ def get_shape(shape): return ret -def get_valid_implements(op, attrs, inputs, out_type, target): +def get_valid_implementations(op, attrs, inputs, out_type, target): """Get all valid implementations from the op strategy. Note that this function doesn't support op with symbolic input shapes. @@ -109,7 +109,7 @@ def get_valid_implements(op, attrs, inputs, out_type, target): out_type : relay.Type The output type. - target : tvm.Target + target : tvm.target.Target The target to compile the op. Returns @@ -134,16 +134,16 @@ def get_valid_implements(op, attrs, inputs, out_type, target): flag = False break if flag: - for impl in spec.implements: + for impl in spec.implementations: ret.append(impl) else: - for impl in spec.implements: + for impl in spec.implementations: ret.append(impl) return ret -def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): - """Select the best implement from the op strategy. +def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): + """Select the best implementation from the op strategy. If use_autotvm is True, it'll first try to find the best implementation based on AutoTVM profile results. If no AutoTVM profile result is found, @@ -168,7 +168,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): out_type : relay.Type The output type. - target : tvm.Target + target : tvm.target.Target The target to compile the op. use_autotvm : bool @@ -179,7 +179,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): ret : tuple(relay.op.OpImplement, list[tvm.Tensor]) The best op implementation and the corresponding output tensors. """ - all_impls = get_valid_implements(op, attrs, inputs, out_type, target) + all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) best_plevel_impl = None for impl in all_impls: @@ -200,7 +200,7 @@ def select_implement(op, attrs, inputs, out_type, target, use_autotvm=True): if workload is None: continue cfg = dispatch_ctx.query(target, workload) - if cfg.cost is None: + if cfg.is_fallback: # It's a fallback config continue if best_cfg is None or best_cfg.cost > cfg.cost: @@ -245,13 +245,13 @@ def lower_call(call, inputs, target): reenable_tracing = True if not is_dyn: - best_impl, outputs = select_implement( + best_impl, outputs = select_implementation( op, call.attrs, inputs, ret_type, target) logger.info("Use implementation %s for op %s", best_impl.name, op.name) else: # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. # Currently, we just use the implementation with highest plevel - best_impl, outputs = select_implement( + best_impl, outputs = select_implementation( op, call.attrs, inputs, ret_type, target, use_autotvm=False) # re-enable AutoTVM tracing diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 2b5a39ae992d..39e68b8333ff 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -277,9 +277,6 @@ def set_params(self, params): return _expr.FunctionSetParams(self, params) - def get_attribute(self, name): - return _expr.FunctionGetAttr(self, name) - def set_attribute(self, name, ref): return _expr.FunctionSetAttr(self, name, ref) diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 8c22e35dfe6c..7427c63a14c1 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -19,7 +19,7 @@ # operator defs from .op import get, register, register_compute, register_gradient, \ register_pattern, register_alter_op_layout, register_legalize, \ - Op, OpPattern, debug + Op, OpPattern, OpStrategy, debug from . import strategy # Operators diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index f63fc7a26c20..0292a6c2bb05 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -354,12 +354,6 @@ TVM_REGISTER_GLOBAL("relay._expr.TempExprRealize") return temp->Realize(); }); -TVM_REGISTER_GLOBAL("relay._expr.FunctionGetAttr") -.set_body_typed( - [](Function func, std::string name) { - return FunctionGetAttr(func, name); -}); - TVM_REGISTER_GLOBAL("relay._expr.FunctionSetAttr") .set_body_typed( [](Function func, std::string name, ObjectRef ref) { diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index fd7ec188611f..4e4122a28cf0 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -14,11 +14,136 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np import tvm import tvm.testing -import numpy as np from tvm import relay +from tvm import autotvm +import topi +from tvm.relay.testing import run_infer_type +from tvm.relay.testing.temp_op_attr import TempOpAttr + + +@autotvm.register_topi_compute("test/conv2d_1") +def _compute_conv2d_1(cfg, input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +@autotvm.register_topi_schedule("test/conv2d_1") +def _schedule_conv2d_1(cfg, outs): + return topi.generic.schedule_conv2d_nchw(outs) + +@autotvm.register_topi_compute("test/conv2d_2") +def _compute_conv2d_2(cfg, input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +@autotvm.register_topi_schedule("test/conv2d_2") +def _schedule_conv2d_2(cfg, outs): + return topi.generic.schedule_conv2d_nchw(outs) + +def _compute_conv2d_3(input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +def _schedule_conv2d_3(outs): + return topi.generic.schedule_conv2d_nchw(outs) + +@tvm.target.override_native_generic_func("test_conv2d_strategy") +def _tmp_strategy(attrs, inputs, out_type, target): + strategy = relay.op.OpStrategy() + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_1), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_1), + name="conv2d_1", + plevel=10) + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_2), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_2), + name="conv2d_2", + plevel=15) + ic = inputs[0].shape[1] + with tvm.te.SpecializedCondition(ic >= 16): + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_3), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_3), + name="conv2d_3", + plevel=20) + return strategy + +def _create_record(task_name, dshape, wshape, target, cost): + args = [tvm.placeholder(dshape), tvm.placeholder(wshape), (1, 1), (1, 1, 1, 1), + (1, 1), 'float32'] + task = autotvm.task.create(task_name, args, target) + cfg = autotvm.ConfigEntity(0, None, {}, []) + cfg.cost = cost + inp = autotvm.MeasureInput(target=target, task=task, config=cfg) + result = autotvm.MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) + return (inp, result) + +def test_get_valid_implementations(): + target = tvm.target.create("llvm") + + def _get_impls(dshape, wshape): + data = relay.var("data", shape=dshape) + weight = relay.var("wshape", shape=wshape) + out = relay.nn.conv2d(data, weight, padding=(1, 1)) + out = run_infer_type(out) + return relay.backend.compile_engine.get_valid_implementations( + relay.op.get("nn.conv2d"), + out.attrs, + [tvm.placeholder(dshape), tvm.placeholder(wshape)], + out.checked_type, + target) + + with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy): + impls = _get_impls((1, 8, 7, 7), (32, 8, 3, 3)) + assert len(impls) == 2 + impls = _get_impls((1, 16, 7, 7), (32, 16, 3, 3)) + assert len(impls) == 3 + +def test_select_implementation(): + target = tvm.target.create("llvm") + + def _select_impl(dshape, wshape, use_autotvm=False): + data = relay.var("data", shape=dshape) + weight = relay.var("wshape", shape=wshape) + out = relay.nn.conv2d(data, weight, padding=(1, 1)) + out = run_infer_type(out) + return relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), + out.attrs, + [tvm.placeholder(dshape), tvm.placeholder(wshape)], + out.checked_type, + target, + use_autotvm) + + with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3)) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3)) + assert impl.name == "conv2d_3" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_3" + + # add autotvm record + records = [] + records.append(_create_record("test/conv2d_1", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.5)) + records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.0)) + with target: + with autotvm.apply_history_best(records): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_1" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_1" + records.append(_create_record("test/conv2d_2", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.2)) + records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.2)) + with target: + with autotvm.apply_history_best(records): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_1" def test_compile_engine(): engine = relay.backend.compile_engine.get() @@ -109,6 +234,8 @@ def test_compile_nhwc_pack(): if __name__ == "__main__": + test_get_valid_implementations() + test_select_implementation() test_compile_engine() test_compile_placeholder_bypass() test_compile_injective_with_tuple() diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index 5b586d34c9bd..bfbf5d6d62b0 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -35,7 +35,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index ae8c5e36b3f8..1921769aa015 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -462,7 +462,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index 09806733129b..f3e4f4c3b3c9 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -33,7 +33,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py index 7211d650f4a3..e95e59f4c6d7 100644 --- a/topi/python/topi/intel_graphics/conv2d_alter_op.py +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -34,7 +34,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): cfg = dispatch_ctx.query(target, None) workload = cfg.workload else: - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 7dd075714a61..f774e76c0ccd 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -428,7 +428,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 8f7957906825..377d81539b7c 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -38,7 +38,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): cfg = dispatch_ctx.query(target, None) workload = cfg.workload else: - _, outs = relay.backend.compile_engine.select_implement( + _, outs = relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) workload = autotvm.task.get_workload(outs) if workload is None: From 3510177b8b3159ced81bd1b25bc2be84d40529d5 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 19 Feb 2020 05:39:58 +0000 Subject: [PATCH 42/48] fix winograd test --- tests/python/relay/test_op_level2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 8da1b129e670..d545d0c1635a 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -320,6 +320,7 @@ def _query_inside(self, target, workload): return self.memory[key] cfg = autotvm.task.space.FallbackConfigEntity() cfg.is_fallback = False + cfg.cost = 0.1 if 'winograd' in workload[0] else 1 cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) From cf43e16d2c82a1d90ca3867eac6b91f71ab15cf2 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 18 Feb 2020 21:45:43 -0800 Subject: [PATCH 43/48] fix doc --- python/tvm/relay/backend/compile_engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 407ac8231481..6466dff6c5df 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -103,7 +103,7 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): attrs : object The op attribute. - inputs : list of tvm.Tensor + inputs : List[tvm.Tensor] Input tensors to the op. out_type : relay.Type @@ -114,8 +114,8 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): Returns ------- - ret : list of relay.op.OpImplement - The list of op implementations. + ret : List[relay.op.OpImplementation] + The list of all valid op implementations. """ fstrategy = op.get_attr("FTVMStrategy") assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name @@ -162,7 +162,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) attrs : object The op attribute. - inputs : list[tvm.Tensor] + inputs : List[tvm.Tensor] Input tensors to the op. out_type : relay.Type @@ -176,7 +176,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) Returns ------- - ret : tuple(relay.op.OpImplement, list[tvm.Tensor]) + ret : tuple(relay.op.OpImplementation, List[tvm.Tensor]) The best op implementation and the corresponding output tensors. """ all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) From f07d92a828571111809c127a4068272bf93ca3c0 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Wed, 19 Feb 2020 17:17:52 -0800 Subject: [PATCH 44/48] rebase --- src/te/schedule/schedule_lang.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 2e26ea8682a1..7a2e05a56fd3 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -825,16 +825,6 @@ SpecializedCondition SpecializedCondition::Current() { return cond; } -TVM_REGISTER_GLOBAL("te.CreateSpecializedCondition") -.set_body_typed([](Array condition) { - return SpecializedCondition(condition); -}); - -TVM_REGISTER_GLOBAL("te.GetCurrentSpecialization") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = SpecializedCondition::Current(); -}); - class SpecializedCondition::Internal { public: static void EnterScope(SpecializedCondition cond) { @@ -846,12 +836,6 @@ class SpecializedCondition::Internal { } }; -TVM_REGISTER_GLOBAL("te.EnterSpecializationScope") -.set_body_typed(SpecializedCondition::Internal::EnterScope); - -TVM_REGISTER_GLOBAL("te.ExitSpecializationScope") -.set_body_typed(SpecializedCondition::Internal::ExitScope); - TVM_REGISTER_NODE_TYPE(StageNode); TVM_REGISTER_NODE_TYPE(IterVarAttrNode); TVM_REGISTER_NODE_TYPE(SplitNode); @@ -1035,5 +1019,21 @@ TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite") TVM_REGISTER_GLOBAL("te.ScheduleRFactor") .set_body_method(&Schedule::rfactor); +TVM_REGISTER_GLOBAL("te.CreateSpecializedCondition") +.set_body_typed([](Array condition) { + return SpecializedCondition(condition); +}); + +TVM_REGISTER_GLOBAL("te.GetCurrentSpecialization") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = SpecializedCondition::Current(); +}); + +TVM_REGISTER_GLOBAL("te.EnterSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::EnterScope); + +TVM_REGISTER_GLOBAL("te.ExitSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::ExitScope); + } // namespace te } // namespace tvm From 59bd3991875f9bf14db5ce7043752a1d1f42264e Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 23 Feb 2020 12:20:59 -0800 Subject: [PATCH 45/48] upgrade tophub version number --- python/tvm/autotvm/tophub.py | 16 ++--- python/tvm/relay/op/strategy/arm_cpu.py | 71 +++++++++++-------- .../python/topi/cuda/conv2d_transpose_nchw.py | 4 +- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index ce0be70e4a15..f13ba5289ce5 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -46,16 +46,16 @@ # the version of each package PACKAGE_VERSION = { - 'arm_cpu': "v0.04", - 'llvm': "v0.03", + 'arm_cpu': "v0.06", + 'llvm': "v0.04", - 'cuda': "v0.06", - 'rocm': "v0.03", - 'opencl': "v0.03", - 'mali': "v0.05", - 'intel_graphics': "v0.01", + 'cuda': "v0.08", + 'rocm': "v0.04", + 'opencl': "v0.04", + 'mali': "v0.06", + 'intel_graphics': "v0.02", - 'vta': "v0.06", + 'vta': "v0.08", } logger = logging.getLogger('autotvm') diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 62cff53ff2c0..9ade4f4cb9eb 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -53,27 +53,35 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": - assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), - name="conv2d_nchw_spatial_pack.arm_cpu") - - _, _, kh, kw = get_const_tuple(kernel.shape) - pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) - if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ - dilation_h == 1 and dilation_w == 1: + if kernel_layout == "OIHW": strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.arm_cpu", - plevel=15) - if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1: + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.arm_cpu") + # check if winograd algorithm is applicable + _, _, kh, kw = get_const_tuple(kernel.shape) + pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), - name="conv2d_nchw_winograd_nnpack.arm_cpu", - plevel=13) + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.arm_cpu", + plevel=15) + if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), + name="conv2d_nchw_winograd_nnpack.arm_cpu", + plevel=13) + elif re.match(r"OIHW\d*o", kernel_layout): + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.arm_cpu") + else: + raise RuntimeError("Unsupported weight layout {} for conv2d NCHW". + format(kernel_layout)) elif layout == "HWCN": assert kernel_layout == "HWIO" logger.warning("conv2d_hwcn is not optimized for arm cpu.") @@ -141,22 +149,27 @@ def _compute_conv2d_nnpack(attrs, inputs, out_type): def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom arm cpu strategy""" dilation = attrs.get_int_tuple("dilation") - padding = attrs.get_int_tuple("padding") groups = attrs.get_int("groups") layout = attrs.data_layout stride_h, stride_w = attrs.get_int_tuple("strides") + tile_size = attrs.get_int("tile_size") + kernel = inputs[1] assert dilation == (1, 1), "Do not support dilate now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": - _, _, kh, kw = get_const_tuple(inputs[1].shape) - pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) - assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.arm_cpu") - if pt == 1 and pb == 1 and pl == 1 and pr == 1: + if len(kernel.shape) == 5: + pad_kh, pad_kw, _, _, _ = get_const_tuple(inputs[1].shape) + kh = pad_kh - tile_size + 1 + kw = pad_kw - tile_size + 1 + assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.arm_cpu") + elif len(kernel.shape) == 4: + # kernel must be packed by winograd nnpack + assert "nnpack" in target.libs strategy.add_implementation( wrap_compute_conv2d_winograd_nnpack( topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform), @@ -164,6 +177,8 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform), name="conv2d_nchw_winograd_nnpack_withou_weight_transform.arm_cpu", plevel=5) + else: + raise RuntimeError("Unsupported kernel shape: {}".format(kernel.shape)) else: raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index c39a2fcac6a6..8751800c4517 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -24,7 +24,7 @@ from ..util import get_const_tuple, traverse_inline -@autotvm.register_topi_compute("nn.conv2d_transpose_nchw.cuda") +@autotvm.register_topi_compute("conv2d_transpose_nchw.cuda") def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): """Transposed 2D convolution nchw forward operator. @@ -101,7 +101,7 @@ def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): return data_out -@autotvm.register_topi_schedule("nn.conv2d_transpose_nchw.cuda") +@autotvm.register_topi_schedule("conv2d_transpose_nchw.cuda") def schedule_conv2d_transpose_nchw(cfg, outs): """TOPI Schedule callback for conv2d transpose operator. From 5338a5763ac6173c9850f35d8ef0f81cdd749ce6 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 23 Feb 2020 21:04:33 +0000 Subject: [PATCH 46/48] fix bug --- python/tvm/relay/op/strategy/__init__.py | 1 + python/tvm/relay/op/strategy/arm_cpu.py | 5 +-- python/tvm/relay/op/strategy/bifrost.py | 35 +++++++++++--------- python/tvm/relay/op/strategy/mali.py | 42 +++++++++++++++--------- topi/python/topi/bifrost/conv2d.py | 9 ++--- 5 files changed, 55 insertions(+), 37 deletions(-) diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py index cbb9eb6470e7..59adf8262664 100644 --- a/python/tvm/relay/op/strategy/__init__.py +++ b/python/tvm/relay/op/strategy/__init__.py @@ -25,6 +25,7 @@ from . import cuda from . import hls from . import mali +from . import bifrost from . import opengl from . import rocm from . import intel_graphics diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 9ade4f4cb9eb..0945f517970f 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -151,15 +151,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout - stride_h, stride_w = attrs.get_int_tuple("strides") - tile_size = attrs.get_int("tile_size") + strides = attrs.get_int_tuple("strides") kernel = inputs[1] assert dilation == (1, 1), "Do not support dilate now" + assert strides == (1, 1), "Do not support strides now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": if len(kernel.shape) == 5: pad_kh, pad_kw, _, _, _ = get_const_tuple(inputs[1].shape) + tile_size = attrs.get_int("tile_size") kh = pad_kh - tile_size + 1 kw = pad_kw - tile_size + 1 assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py index cf60790f75e3..e8f62980a621 100644 --- a/python/tvm/relay/op/strategy/bifrost.py +++ b/python/tvm/relay/op/strategy/bifrost.py @@ -16,6 +16,7 @@ # under the License. """Definition of bifrost operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import re import topi from .generic import * from .. import op as _op @@ -36,20 +37,25 @@ def conv2d_strategy_bifrost(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": - assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack), - name="conv2d_nchw_spatial_pack.bifrost") + if kernel_layout == "OIHW": + strategy.add_implementation( + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.bifrost") - _, _, kh, kw = get_const_tuple(kernel.shape) - if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ - dilation_h == 1 and dilation_w == 1: + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.bifrost", + plevel=15) + elif re.match(r"OIHW\d*o", kernel_layout): strategy.add_implementation( - wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), - wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.bifrost", - plevel=15) + wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.bifrost") else: raise RuntimeError("Unsupported conv2d layout {} for Mali(Bifrost)". format(layout)) @@ -73,13 +79,12 @@ def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout - stride_h, stride_w = attrs.get_int_tuple("strides") + strides = attrs.get_int_tuple("strides") assert dilation == (1, 1), "Do not support dilate now" + assert strides == (1, 1), "Do not support strides now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": - _, _, kh, kw = get_const_tuple(inputs[1].shape) - assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 strategy.add_implementation( wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd), wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd), diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index 38ea231d681e..8f1fa291d236 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -16,6 +16,7 @@ # under the License. """Definition of mali operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import re import topi from .generic import * from .. import op as _op @@ -35,20 +36,28 @@ def conv2d_strategy_mali(attrs, inputs, out_type, target): if groups == 1: if layout == "NCHW": - assert kernel_layout == "OIHW" - strategy.add_implementation( - wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), - wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), - name="conv2d_nchw_spatial_pack.mali") - - _, _, kh, kw = get_const_tuple(kernel.shape) - if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ - dilation_h == 1 and dilation_w == 1: + if kernel_layout == "OIHW": strategy.add_implementation( - wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), - wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.mali", - plevel=15) + wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.mali") + # check if winograd algorithm is applicable + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.mali", + plevel=15) + elif re.match(r"OIHW\d*o", kernel_layout): + strategy.add_implementation( + wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.mali") + else: + raise RuntimeError("Unsupported weight layout {} for conv2d NCHW". + format(kernel_layout)) else: raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -70,13 +79,14 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout - stride_h, stride_w = attrs.get_int_tuple("strides") + strides = attrs.get_int_tuple("strides") + kernel = inputs[1] assert dilation == (1, 1), "Do not support dilate now" + assert strides == (1, 1), "Do not support strides now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": - _, _, kh, kw = get_const_tuple(inputs[1].shape) - assert kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 + assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim" strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 1921769aa015..816024ebdb25 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -457,7 +457,7 @@ def _schedule_winograd(cfg, s, op): ##### REGISTER ALTER OP LAYOUT ##### -@nn.conv2d_alter_layout.register(["bifrost"]) +@nn.conv2d_alter_layout.register("bifrost") def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) dispatch_ctx = autotvm.task.DispatchContext.current @@ -503,7 +503,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.conv2d(*inputs, **new_attrs) - elif topi_tmpl == "conv2d_nchw_winograd.bifrost": + + if topi_tmpl == "conv2d_nchw_winograd.bifrost": assert data_layout == "NCHW" and kernel_layout == "OIHW" N, CI, H, W = get_const_tuple(data.shape) CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -527,5 +528,5 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): return relay.nn.contrib_conv2d_winograd_without_weight_transform( inputs[0], weight_expr, **new_attrs) - else: - return None + + return None From 24ae7977ea68435af88756b86244c49ac1049506 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 23 Feb 2020 13:06:07 -0800 Subject: [PATCH 47/48] re-enable vta tsim test after tophub is upgraded --- tests/scripts/task_python_vta_tsim.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh index 5f194b297678..eba62e537a85 100755 --- a/tests/scripts/task_python_vta_tsim.sh +++ b/tests/scripts/task_python_vta_tsim.sh @@ -46,10 +46,8 @@ echo "Running unittest in tsim..." python3 -m pytest -v vta/tests/python/unittest # Run unit tests in cycle accurate simulator -# TODO(@icemelon9): temporarily disable tsim test because it takes a long time without tophub logs. -# Re-enable this test after update the tophub logs. -# echo "Running integration test in tsim..." -# python3 -m pytest -v vta/tests/python/integration +echo "Running integration test in tsim..." +python3 -m pytest -v vta/tests/python/integration # Reset default fsim simulation cp vta/config/fsim_sample.json vta/config/vta_config.json From 4f1806a912100317b2c13ad5f50cc1503950680b Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 24 Feb 2020 19:21:03 +0000 Subject: [PATCH 48/48] fix vta test to use the correct args so the config can be found in tophub --- vta/tests/python/integration/test_benchmark_topi_conv2d.py | 6 ++++-- .../integration/test_benchmark_topi_conv2d_transpose.py | 4 +++- .../python/integration/test_benchmark_topi_group_conv2d.py | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 9e65eab8e154..6935e4794c4e 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize @@ -106,16 +107,17 @@ def run_conv2d(env, remote, wl, target, data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule with target: if data_pack: res = conv2d_fcompute( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype) else: res = conv2d_fcompute( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index 284655adf6da..2d96a7313480 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize @@ -103,11 +104,12 @@ def run_conv2d_transpose(env, remote, wl, target, kernel_shape = w_shape data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule with target: res = fcompute( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), env.acc_dtype) + data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype) res = topi.right_shift(res, env.WGT_WIDTH) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 5ec1be8ec0dc..31fef4923328 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util import topi @@ -103,10 +104,12 @@ def run_group_conv2d(env, remote, wl, target, data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) + # Define base computation schedule with target: res = fcompute( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), wl.groups, env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias)