From 00c2aca6b6ea7df22aa2417fa5864e8d39c362f8 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 20 Sep 2022 11:20:35 +0000 Subject: [PATCH 01/36] add extra attr property set --- paddle/fluid/operators/ops_extra_info.h | 110 +++++++++++++++++++ paddle/phi/backends/onednn/onednn_context.cc | 67 +++++++++++ paddle/phi/backends/onednn/onednn_context.h | 8 ++ paddle/phi/core/attribute.h | 2 +- paddle/phi/core/kernel_context.h | 4 - paddle/phi/core/kernel_registry.h | 2 - paddle/phi/core/kernel_utils.h | 15 --- 7 files changed, 186 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 8f3780dd3a3ad..2373b00c13c97 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -14,11 +14,121 @@ #pragma once +#include +#include + #include "paddle/fluid/framework/attribute.h" namespace paddle { namespace operators { +// This file is to be compatible with the bad design and +// implementation of fluid in the past + +// Many operators in fluid have extra attributes, which are generally added +// to implement some specific kernel selection and to meet the specialization +// needs of a specific operation library like mkldnn or cudnn +enum class ExtraAttrProperty : uint8_t { + // The attributes that are no longer used by any scene + DEPRECATED = 0, + // The attributes used for framework execution scheduling, + // such as `use_mkldnn`, `use_cudnn`, no need to save + SCHEDULE, + // The attributes for ONEDNN only, can be saved in OneDNNContext + ONEDNN, + // The attributes for ONEDNN only, can be saved in GPUContext + GPUDNN, + // Add necessary properties as needed +}; + +class ExtraAttrPropertySet final { + public: + constexpr ExtraAttrPropertySet() : bitset_(0) {} + constexpr ExtraAttrPropertySet(ExtraAttrProperty e) // NOLINT + : bitset_(e == ExtraAttrProperty::DEPRECATED + ? 0 + : 1ULL << (static_cast(e) - 1)) {} + + inline uint64_t bitset() const { return bitset_; } + + bool inline Contains(ExtraAttrProperty e) const { + // DEPRECATED ExtraAttr always return false + return static_cast(bitset_ & ExtraAttrPropertySet(e).bitset()); + } + bool IsEmpty() const { return bitset_ == 0; } + + ExtraAttrPropertySet operator|(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ | other.bitset()); + } + ExtraAttrPropertySet operator&(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ & other.bitset()); + } + ExtraAttrPropertySet operator-(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ & ~other.bitset()); + } + ExtraAttrPropertySet operator^(const ExtraAttrPropertySet& other) const { + return ExtraAttrPropertySet(bitset_ ^ other.bitset()); + } + + bool operator==(const ExtraAttrPropertySet& other) const { + return bitset_ == other.bitset(); + } + + private: + constexpr ExtraAttrPropertySet(uint64_t bitset) : bitset_(bitset) {} + uint64_t bitset_; +}; + +const std::unordered_map + extra_attr_properties = { + // DEPRECATED attributes + {"use_quantizer", ExtraAttrProperty::DEPRECATED}, + // SCHEDULE attributes + {"use_cudnn", ExtraAttrProperty::SCHEDULE}, + {"use_mkldnn", ExtraAttrProperty::SCHEDULE}, + // ONEDNN dedicated attributes + {"data_format", ExtraAttrProperty::ONEDNN}, + {"force_fp32_output", ExtraAttrProperty::ONEDNN}, + {"fuse_activation", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_type", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_alpha", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_beta", ExtraAttrProperty::ONEDNN}, + {"fuse_activation_scale", ExtraAttrProperty::ONEDNN}, + {"fuse_alpha", ExtraAttrProperty::ONEDNN}, + {"fuse_beta", ExtraAttrProperty::ONEDNN}, + {"fuse_brelu", ExtraAttrProperty::ONEDNN}, + {"fuse_brelu_threshold", ExtraAttrProperty::ONEDNN}, + {"fuse_relu", ExtraAttrProperty::ONEDNN}, + {"fuse_residual_connection", ExtraAttrProperty::ONEDNN}, + {"fuse_with_relu", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_Out", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_Out", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_X", ExtraAttrProperty::ONEDNN}, + {"fused_reshape_Y", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, + {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, + {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, + {"scale_x", ExtraAttrProperty::ONEDNN}, + {"scale_y", ExtraAttrProperty::ONEDNN}, + {"scale_out", ExtraAttrProperty::ONEDNN}, + {"Scale_in_eltwise", ExtraAttrProperty::ONEDNN}, + {"Scale_x", ExtraAttrProperty::ONEDNN}, + {"Scale_y", ExtraAttrProperty::ONEDNN}, + {"Scale_out", ExtraAttrProperty::ONEDNN}, + {"Scale_weights", ExtraAttrProperty::ONEDNN}, + {"x_data_format", ExtraAttrProperty::ONEDNN}, + {"y_data_format", ExtraAttrProperty::ONEDNN}, + // GPUDNN dedicated attributes + {"exhaustive_search", ExtraAttrProperty::GPUDNN}, + {"fuse_relu_before_depthwise_conv", ExtraAttrProperty::GPUDNN}, + {"use_addto", ExtraAttrProperty::GPUDNN}, + {"workspace_size_MB", ExtraAttrProperty::GPUDNN}, + // Mixed-use attributes + {"is_test", + ExtraAttrPropertySet(ExtraAttrProperty::ONEDNN) | + ExtraAttrPropertySet(ExtraAttrProperty::GPUDNN)}, +}; + template struct ExtraAttrChecker { ExtraAttrChecker(const std::string& attr_name, T default_value) diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index 950483a469ed8..a5ecc1724bef5 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -16,6 +16,7 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" +#include "paddle/utils/flat_hash_map.h" #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/device_context.h" @@ -284,6 +285,39 @@ struct OneDNNContext::Impl { return key_it->second; } + bool HasDeviceAttr(const std::string& attr_name) const { + return device_attrs_.count(attr_name) != 0UL; + } + const Attribute& GetDeviceAttr(const std::string& attr_name) const { + auto iter = device_attrs_.find(attr_name); + PADDLE_ENFORCE_NE( + iter, + device_attrs_.end(), + phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext.")); + return iter->second; + } + + void SetDeviceAttr(const std::string& attr_name, const Attribute& attr) { + if (device_attrs_version_ == 0) { + device_attrs_[attr_name] = attr; + } else { + auto iter = device_attrs_.find(attr_name); + // TODO(chenweihang): this check is not enough + PADDLE_ENFORCE_EQ( + iter, + device_attrs_.end(), + phi::errors::AlreadyExists( + "Attribute `%s` in OneDNNContext is being used now.")); + device_attrs_[attr_name] = attr; + } + } + + int8_t DeviceAttrsVersion() const { return device_attrs_version_; } + + void SetDeviceAttrsVersion(int8_t version) { + device_attrs_version_ = version; + } + std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase @@ -291,6 +325,17 @@ struct OneDNNContext::Impl { std::shared_ptr p_mutex_; // 0 - clearing is allowed. x > 0 do not clear. unsigned int block_next_cache_clearing_ = 0; + + // For thread safety when using device_attrs_ + int8_t device_attrs_version_ = 0; + // Holds some attributes only used by the onednn kernel calculation + // Since original mkldnn op kernel directly adds the operations that require + // fusion to the native kernel operations, and uses the attribute `fuse_xxx` + // to control, for onednn, there will be some attributes that seem to be + // independent of the device are also saved here. + // Here, the operation of fusion needs to be implemented separately as + // a fusion op and kernel, instead of patching it to a basic operation. + AttributeMap device_attrs_; }; OneDNNContext::OneDNNContext(const Place& place) @@ -322,5 +367,27 @@ OneDNNContext::BlobPtr_t OneDNNContext::GetBlob( return impl_->GetBlob(name); } +bool OneDNNContext::HasDeviceAttr(const std::string& attr_name) const { + return impl_->HasDeviceAttr(attr_name); +} + +const Attribute& OneDNNContext::GetDeviceAttr( + const std::string& attr_name) const { + return impl_->GetDeviceAttr(attr_name); +} + +void OneDNNContext::SetDeviceAttr(const std::string& attr_name, + const Attribute& attr) { + return impl_->SetDeviceAttr(attr_name, attr); +} + +int8_t OneDNNContext::DeviceAttrsVersion() const { + return impl_->DeviceAttrsVersion(); +} + +void OneDNNContext::SetDeviceAttrsVersion(int8_t version) { + return impl_->SetDeviceAttrsVersion(version); +} + } // namespace phi #endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index d7cf8a0ff4902..c2e49ccb64cbd 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/attribute.h" namespace phi { @@ -134,6 +135,13 @@ class OneDNNContext : public CPUContext { return OneDNNContextThreadLocals::fetch(); } + // Several methods for adapting ONEDNN-specific attributes + bool HasDeviceAttr(const std::string& attr_name) const; + const Attribute& GetDeviceAttr(const std::string& attr_name) const; + void SetDeviceAttr(const std::string& attr_name, const Attribute& attr); + int8_t DeviceAttrsVersion() const; + void SetDeviceAttrsVersion(int8_t version); + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h index d8d684b9030e9..75a6b16f46e0b 100644 --- a/paddle/phi/core/attribute.h +++ b/paddle/phi/core/attribute.h @@ -48,6 +48,6 @@ using Attribute = paddle::variant; -using RuntimeAttrs = paddle::flat_hash_map; +using AttributeMap = paddle::flat_hash_map; } // namespace phi diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 7b79138fe78a3..830443fca8fba 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -138,8 +138,6 @@ class KernelContext { template const AttrType& AttrAt(size_t idx) const; - const RuntimeAttrs& GetRuntimeAttrs() const { return runtime_attrs_; } - size_t InputsSize() const { return inputs_.size(); } size_t OutputsSize() const { return outputs_.size(); } size_t AttrsSize() const { return attrs_.size(); } @@ -154,8 +152,6 @@ class KernelContext { paddle::small_vector, kInputSmallVectorSize> input_range_; paddle::small_vector, kOutputSmallVectorSize> output_range_; - - RuntimeAttrs runtime_attrs_; }; } // namespace phi diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 28c750dd9d923..fa2fd6f53a313 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -215,8 +215,6 @@ struct KernelArgsParseFunctor { args_def->AppendAttribute(AttributeType::DATA_LAYOUT); } else if (arg_type == std::type_index(typeid(Place))) { args_def->AppendAttribute(AttributeType::PLACE); - } else if (arg_type == std::type_index(typeid(RuntimeAttrs))) { - // do nothing } else { PADDLE_THROW(phi::errors::Unavailable( "Unsupported kernel argument type `%s`.", arg_type.name())); diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 9237b8eb31a2e..6d9cc95fe71e0 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -328,21 +328,6 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(TensorArray); - template - struct KernelCallHelper { - template - static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { - const auto& runtime_attrs = ctx->GetRuntimeAttrs(); - KernelCallHelper:: - template Compute( - ctx, pargs..., runtime_attrs); - } - }; - /* End case */ template struct KernelCallHelper> { From 28e9710608d882179c73922904e32e0a6a57f486 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 21 Sep 2022 04:59:03 +0000 Subject: [PATCH 02/36] add type_info for all context --- paddle/fluid/framework/operator.cc | 18 ++++++++++++++ .../platform/device/mlu/device_context.h | 6 ++++- paddle/fluid/platform/device_context.h | 24 +++++++++++++++---- paddle/phi/backends/cpu/cpu_context.h | 5 +++- paddle/phi/backends/custom/custom_context.h | 5 +++- paddle/phi/backends/gpu/gpu_context.h | 5 +++- paddle/phi/backends/onednn/onednn_context.h | 2 ++ paddle/phi/backends/xpu/xpu_context.h | 5 +++- paddle/phi/core/device_context.h | 13 ++++++++++ 9 files changed, 74 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4c28a9b59535e..83be1308bda86 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2879,6 +2879,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { if (attr_iter == Attrs().end()) { + // TODO(chenweihang): remove this backup searching later attr_iter = RuntimeAttrs().find(attr_names[i]); PADDLE_ENFORCE_NE(attr_iter, RuntimeAttrs().end(), @@ -2962,6 +2963,23 @@ void OperatorWithKernel::BuildPhiKernelContext( } } VLOG(4) << "Done attributes"; + + // For compatible with Op with extra attrs for specific backend + auto& runtime_attrs = RuntimeAttrs(); + for (const auto& attr : runtime_attrs) { + auto& attr_name = attr.first; + // auto& attr_val = attr.second; + VLOG(4) << "attr name: " << attr_name; + if (phi::OneDNNContext::classof(dev_ctx)) { + VLOG(4) << "Current DeviceContext is OneDNNContext"; + } else if (phi::GPUContext::classof(dev_ctx)) { + // For GPUDNN, GPUDNN has no DeviceContext, so use GPUContext + VLOG(4) << "Current DeviceContext is GPUContext"; + } else { + // skip current attr, do nothing + } + } + VLOG(4) << "Done runtime attributes"; } } // namespace framework diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index d8bb762315948..7bf7fdf054e45 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -74,7 +74,9 @@ class MLUContext { DISABLE_COPY_AND_ASSIGN(MLUContext); }; -class MLUDeviceContext : public DeviceContext { +class MLUDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: explicit MLUDeviceContext(MLUPlace place); virtual ~MLUDeviceContext(); @@ -130,6 +132,8 @@ class MLUDeviceContext : public DeviceContext { return thread_ctx_.at(this); } + static const char* name() { return "MLUDeviceContext"; } + private: int compute_capability_; int driver_version_; diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7939f8ff7c066..f6646d1f5890d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -144,7 +144,9 @@ struct DefaultDeviceContextType { // Graphcore IPU #ifdef PADDLE_WITH_IPU -class IPUDeviceContext : public DeviceContext { +class IPUDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: IPUDeviceContext() = delete; explicit IPUDeviceContext(IPUPlace place); @@ -154,6 +156,8 @@ class IPUDeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; + static const char* name() { return "IPUDeviceContext"; } + private: IPUPlace place_; }; @@ -188,7 +192,9 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_ASCEND_CL -class NPUDeviceContext : public DeviceContext { +class NPUDeviceContext + : public DeviceContex, + public TypeInfoTraits { public: explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); @@ -224,6 +230,8 @@ class NPUDeviceContext : public DeviceContext { // void WaitStreamCallback() const { return stream_->WaitCallback(); } + static const char* name() { return "NPUDeviceContext"; } + private: NPUPlace place_; aclrtContext context_; @@ -248,7 +256,9 @@ struct DefaultDeviceContextType { }; // Currently, NPUPinnedDeviceContext is only used to data copying. -class NPUPinnedDeviceContext : public DeviceContext { +class NPUPinnedDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: NPUPinnedDeviceContext(); explicit NPUPinnedDeviceContext(NPUPinnedPlace place); @@ -257,6 +267,8 @@ class NPUPinnedDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + static const char* name() { return "NPUPinnedDeviceContext"; } + private: NPUPinnedPlace place_; std::unique_ptr eigen_device_; @@ -276,7 +288,9 @@ struct DefaultDeviceContextType { }; // Currently, CUDAPinnedDeviceContext is only used to data copying. -class CUDAPinnedDeviceContext : public DeviceContext { +class CUDAPinnedDeviceContext + : public DeviceContext, + public phi::TypeInfoTraits { public: CUDAPinnedDeviceContext(); explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place); @@ -285,6 +299,8 @@ class CUDAPinnedDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + static const char* name() { return "CUDAPinnedDeviceContext"; } + private: CUDAPinnedPlace place_; std::unique_ptr eigen_device_; diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index 58548b2e04e02..3a8b5b0a82adc 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -24,7 +24,8 @@ limitations under the License. */ namespace phi { -class PADDLE_API CPUContext : public DeviceContext { +class PADDLE_API CPUContext : public DeviceContext, + public TypeInfoTraits { public: CPUContext(); CPUContext(CPUContext&&); @@ -34,6 +35,8 @@ class PADDLE_API CPUContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; const Place& GetPlace() const override; + static const char* name() { return "CPUContext"; } + protected: // NOTE: External users manage resources. Used in inference scenarios. // The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 57be8534fa954..d007cb62cd4f9 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -21,7 +21,8 @@ limitations under the License. */ namespace phi { -class CustomContext : public DeviceContext { +class CustomContext : public DeviceContext, + public TypeInfoTraits { public: explicit CustomContext(const CustomPlace&); @@ -35,6 +36,8 @@ class CustomContext : public DeviceContext { // Wait for all operations completion in the stream. void Wait() const override; + static const char* name() { return "CustomContext"; } + public: // NOTE: DeviceContext hold resources. Used in training scenarios. // The interface used by the training scene, DeviceContext will initialize diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 989bbbcbbf5f8..eeb93cb398705 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,7 +77,8 @@ class DnnWorkspaceHandle { std::unique_ptr mtx_; }; -class PADDLE_API GPUContext : public DeviceContext { +class PADDLE_API GPUContext : public DeviceContext, + public TypeInfoTraits { public: explicit GPUContext(const GPUPlace& place, bool init = true); @@ -166,6 +167,8 @@ class PADDLE_API GPUContext : public DeviceContext { void WaitStreamCallback() const; + static const char* name() { return "GPUContext"; } + public: /*! \brief Return nccl communicators. */ ncclComm_t nccl_comm() const; diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index c2e49ccb64cbd..33ca044260502 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -142,6 +142,8 @@ class OneDNNContext : public CPUContext { int8_t DeviceAttrsVersion() const; void SetDeviceAttrsVersion(int8_t version); + static const char* name() { return "OneDNNContext"; } + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 90fc7c97b785c..88b3ac56339b1 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -26,7 +26,8 @@ namespace xpu = baidu::xpu::api; namespace phi { -class XPUContext : public DeviceContext { +class XPUContext : public DeviceContext, + public TypeInfoTraits { public: XPUContext(); @@ -65,6 +66,8 @@ class XPUContext : public DeviceContext { XPUStream stream() const; + static const char* name() { return "XPUContext"; } + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index c845d50f77564..5dad261f43b34 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/generator.h" +#include "paddle/phi/core/utils/type_registry.h" namespace phi { class TensorBase; @@ -188,9 +189,21 @@ class PADDLE_API DeviceContext { */ Generator* GetHostGenerator() const; + /** + * @brief Return the type information of the derived class to support + * safely downcast in non-rtti environment. + * + * @return The type information of the derived class. + */ + TypeInfo type_info() const { return type_info_; } + private: struct Impl; std::unique_ptr impl_; + + template + friend class TypeInfoTraits; + TypeInfo type_info_{TypeInfo::kUnknownType}; }; } // namespace phi From cd630f0a785f4e51dac0bbd896fa78ca0900fd7a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 21 Sep 2022 11:40:14 +0000 Subject: [PATCH 03/36] add onednn context to all context --- paddle/phi/backends/all_context.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 392df09fcffd8..070bc71cc9ecf 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -24,6 +24,9 @@ limitations under the License. */ #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/onednn/onednn_context.h" +#endif +#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" #endif From d63367271d6017400cef0bf07cbae2de2468bdeb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 04:00:56 +0000 Subject: [PATCH 04/36] fix context compile error --- paddle/phi/backends/all_context.h | 4 -- paddle/phi/backends/gpu/gpu_context.cc | 61 ++++++++++++++++++++ paddle/phi/backends/gpu/gpu_context.h | 12 +++- paddle/phi/backends/onednn/onednn_context.cc | 57 +++++++++--------- paddle/phi/backends/onednn/onednn_context.h | 10 ++-- paddle/phi/backends/xpu/xpu_context.h | 4 ++ paddle/phi/core/kernel_utils.h | 8 +-- 7 files changed, 108 insertions(+), 48 deletions(-) diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 070bc71cc9ecf..12cb45d23812f 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -23,12 +23,8 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/onednn/onednn_context.h" -#endif -#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" -#endif #ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index cf08f5b4affa9..3bd9f3e6dd481 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -717,6 +717,37 @@ struct GPUContext::Impl { } } + bool HasDnnAttr(const std::string& attr_name) const { + return dnn_attrs_.count(attr_name) != 0UL; + } + const Attribute& GetDnnAttr(const std::string& attr_name) const { + auto iter = dnn_attrs_.find(attr_name); + PADDLE_ENFORCE_NE( + iter, + dnn_attrs_.end(), + phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext.")); + return iter->second; + } + + void SetDnnAttr(const std::string& attr_name, const Attribute& attr) { + if (dnn_attrs_version_ == 0) { + dnn_attrs_[attr_name] = attr; + } else { + auto iter = dnn_attrs_.find(attr_name); + // TODO(chenweihang): this check is not enough + PADDLE_ENFORCE_EQ( + iter, + dnn_attrs_.end(), + phi::errors::AlreadyExists( + "Attribute `%s` in OneDNNContext is being used now.")); + dnn_attrs_[attr_name] = attr; + } + } + + int8_t DnnAttrsVersion() const { return dnn_attrs_version_; } + + void SetDnnAttrsVersion(int8_t version) { dnn_attrs_version_ = version; } + // use one flag for all handles? // they should be accessed consistently bool owned_{false}; @@ -780,6 +811,17 @@ struct GPUContext::Impl { Allocator* allocator_{nullptr}; // external resource. // A internal resouce to initinalize eigen_device. std::unique_ptr eigen_stream_{nullptr}; + + // For thread safety when using onednn_attrs_ + int8_t dnn_attrs_version_ = 0; + // Holds some attributes only used by the gpudnn kernel calculation + // Since original mkldnn op kernel directly adds the operations that require + // fusion to the native kernel operations, and uses the attribute `fuse_xxx` + // to control, for gpudnn, there will be some attributes that seem to be + // independent of the device are also saved here. + // Here, the operation of fusion needs to be implemented separately as + // a fusion op and kernel, instead of patching it to a basic operation. + AttributeMap dnn_attrs_; }; GPUContext::GPUContext(GPUContext&&) = default; @@ -1000,4 +1042,23 @@ void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } +bool GPUContext::HasDnnAttr(const std::string& attr_name) const { + return impl_->HasDnnAttr(attr_name); +} + +const Attribute& GPUContext::GetDnnAttr(const std::string& attr_name) const { + return impl_->GetDnnAttr(attr_name); +} + +void GPUContext::SetDnnAttr(const std::string& attr_name, + const Attribute& attr) { + return impl_->SetDnnAttr(attr_name, attr); +} + +int8_t GPUContext::DnnAttrsVersion() const { return impl_->DnnAttrsVersion(); } + +void GPUContext::SetDnnAttrsVersion(int8_t version) { + return impl_->SetDnnAttrsVersion(version); +} + } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index eeb93cb398705..ca0e63b0f7974 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" namespace phi { @@ -167,6 +168,13 @@ class PADDLE_API GPUContext : public DeviceContext, void WaitStreamCallback() const; + // Several methods for adapting Dnn-specific attributes + bool HasDnnAttr(const std::string& attr_name) const; + const Attribute& GetDnnAttr(const std::string& attr_name) const; + void SetDnnAttr(const std::string& attr_name, const Attribute& attr); + int8_t DnnAttrsVersion() const; + void SetDnnAttrsVersion(int8_t version); + static const char* name() { return "GPUContext"; } public: @@ -253,10 +261,10 @@ class PADDLE_API GPUContext : public DeviceContext, std::unique_ptr impl_; }; -// Note: In order to register the kernel of CUDNN, GPUDNNContext is required. +// Note: In order to register the kernel of CUDNN, DnnContext is required. // Currently, CUDNN kernel directly uses GPUContext. But if the kernel function // has the same name, this will lead to duplicate instantiations of GPU kernel -// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we +// and Dnn kernel function, so if we using DnnContext = GPUContext, we // must use different function name for cudnn kernel using GPUDNNContext = GPUContext; diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index a5ecc1724bef5..f453b7da81807 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -285,38 +285,36 @@ struct OneDNNContext::Impl { return key_it->second; } - bool HasDeviceAttr(const std::string& attr_name) const { - return device_attrs_.count(attr_name) != 0UL; + bool HasDnnAttr(const std::string& attr_name) const { + return dnn_attrs_.count(attr_name) != 0UL; } - const Attribute& GetDeviceAttr(const std::string& attr_name) const { - auto iter = device_attrs_.find(attr_name); + const Attribute& GetDnnAttr(const std::string& attr_name) const { + auto iter = dnn_attrs_.find(attr_name); PADDLE_ENFORCE_NE( iter, - device_attrs_.end(), + dnn_attrs_.end(), phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext.")); return iter->second; } - void SetDeviceAttr(const std::string& attr_name, const Attribute& attr) { - if (device_attrs_version_ == 0) { - device_attrs_[attr_name] = attr; + void SetDnnAttr(const std::string& attr_name, const Attribute& attr) { + if (dnn_attrs_version_ == 0) { + dnn_attrs_[attr_name] = attr; } else { - auto iter = device_attrs_.find(attr_name); + auto iter = dnn_attrs_.find(attr_name); // TODO(chenweihang): this check is not enough PADDLE_ENFORCE_EQ( iter, - device_attrs_.end(), + dnn_attrs_.end(), phi::errors::AlreadyExists( "Attribute `%s` in OneDNNContext is being used now.")); - device_attrs_[attr_name] = attr; + dnn_attrs_[attr_name] = attr; } } - int8_t DeviceAttrsVersion() const { return device_attrs_version_; } + int8_t DnnAttrsVersion() const { return dnn_attrs_version_; } - void SetDeviceAttrsVersion(int8_t version) { - device_attrs_version_ = version; - } + void SetDnnAttrsVersion(int8_t version) { dnn_attrs_version_ = version; } std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed @@ -326,8 +324,8 @@ struct OneDNNContext::Impl { // 0 - clearing is allowed. x > 0 do not clear. unsigned int block_next_cache_clearing_ = 0; - // For thread safety when using device_attrs_ - int8_t device_attrs_version_ = 0; + // For thread safety when using dnn_attrs_ + int8_t dnn_attrs_version_ = 0; // Holds some attributes only used by the onednn kernel calculation // Since original mkldnn op kernel directly adds the operations that require // fusion to the native kernel operations, and uses the attribute `fuse_xxx` @@ -335,7 +333,7 @@ struct OneDNNContext::Impl { // independent of the device are also saved here. // Here, the operation of fusion needs to be implemented separately as // a fusion op and kernel, instead of patching it to a basic operation. - AttributeMap device_attrs_; + AttributeMap dnn_attrs_; }; OneDNNContext::OneDNNContext(const Place& place) @@ -367,26 +365,25 @@ OneDNNContext::BlobPtr_t OneDNNContext::GetBlob( return impl_->GetBlob(name); } -bool OneDNNContext::HasDeviceAttr(const std::string& attr_name) const { - return impl_->HasDeviceAttr(attr_name); +bool OneDNNContext::HasDnnAttr(const std::string& attr_name) const { + return impl_->HasDnnAttr(attr_name); } -const Attribute& OneDNNContext::GetDeviceAttr( - const std::string& attr_name) const { - return impl_->GetDeviceAttr(attr_name); +const Attribute& OneDNNContext::GetDnnAttr(const std::string& attr_name) const { + return impl_->GetDnnAttr(attr_name); } -void OneDNNContext::SetDeviceAttr(const std::string& attr_name, - const Attribute& attr) { - return impl_->SetDeviceAttr(attr_name, attr); +void OneDNNContext::SetDnnAttr(const std::string& attr_name, + const Attribute& attr) { + return impl_->SetDnnAttr(attr_name, attr); } -int8_t OneDNNContext::DeviceAttrsVersion() const { - return impl_->DeviceAttrsVersion(); +int8_t OneDNNContext::DnnAttrsVersion() const { + return impl_->DnnAttrsVersion(); } -void OneDNNContext::SetDeviceAttrsVersion(int8_t version) { - return impl_->SetDeviceAttrsVersion(version); +void OneDNNContext::SetDnnAttrsVersion(int8_t version) { + return impl_->SetDnnAttrsVersion(version); } } // namespace phi diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index 33ca044260502..8cb027eecd3b0 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -136,11 +136,11 @@ class OneDNNContext : public CPUContext { } // Several methods for adapting ONEDNN-specific attributes - bool HasDeviceAttr(const std::string& attr_name) const; - const Attribute& GetDeviceAttr(const std::string& attr_name) const; - void SetDeviceAttr(const std::string& attr_name, const Attribute& attr); - int8_t DeviceAttrsVersion() const; - void SetDeviceAttrsVersion(int8_t version); + bool HasDnnAttr(const std::string& attr_name) const; + const Attribute& GetDnnAttr(const std::string& attr_name) const; + void SetDnnAttr(const std::string& attr_name, const Attribute& attr); + int8_t DnnAttrsVersion() const; + void SetDnnAttrsVersion(int8_t version); static const char* name() { return "OneDNNContext"; } diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 88b3ac56339b1..61967a9a58d39 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_XPU + #include #include "paddle/phi/backends/xpu/forwards.h" @@ -82,3 +84,5 @@ using KPSContext = XPUContext; #endif } // namespace phi + +#endif diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 6d9cc95fe71e0..e7dd6ca437ac9 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -14,13 +14,7 @@ #pragma once -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/custom/custom_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/onednn/onednn_context.h" -#ifdef PADDLE_WITH_XPU -#include "paddle/phi/backends/xpu/xpu_context.h" -#endif +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" From d651bf96ce5d74b90ce13fa28f9d048a5cb91151 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 12:28:30 +0000 Subject: [PATCH 05/36] simplify conv kernel args --- paddle/phi/kernels/conv_kernel.cc | 56 ----------------- paddle/phi/kernels/conv_kernel.h | 17 +----- paddle/phi/kernels/cpu/conv_kernel.cc | 70 ++++++++++++++-------- paddle/phi/kernels/gpu/conv_kernel.cu | 46 ++++++++++---- paddle/phi/kernels/gpudnn/conv_kernel.cu | 22 +++---- paddle/phi/kernels/impl/conv_kernel_impl.h | 23 ++++--- paddle/phi/kernels/xpu/conv_kernel.cc | 10 +--- paddle/phi/ops/compat/conv2d_sig.cc | 34 +++-------- 8 files changed, 108 insertions(+), 170 deletions(-) delete mode 100644 paddle/phi/kernels/conv_kernel.cc diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc deleted file mode 100644 index 542a4ec8a61c8..0000000000000 --- a/paddle/phi/kernels/conv_kernel.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_kernel.h" - -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void ConvInferKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - paddding_algorithm, - groups, - dilations, - data_format, - /*use_addto=*/false, - /*workspace_size_MB=*/ - paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(), - /*exhaustive_search=*/false, - out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - conv2d_infer, CPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - conv2d_infer, GPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {} -#endif diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h index a105fe794f94d..0c02e8736aa93 100644 --- a/paddle/phi/kernels/conv_kernel.h +++ b/paddle/phi/kernels/conv_kernel.h @@ -25,12 +25,9 @@ void ConvKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, - int groups, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out); template @@ -64,16 +61,4 @@ void DepthwiseConvKernel(const Context& dev_ctx, bool fuse_relu, DenseTensor* out); -template -void ConvInferKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out); - } // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc index ec3253194930b..12bfa852d96c9 100644 --- a/paddle/phi/kernels/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -19,6 +19,30 @@ #include "paddle/phi/kernels/impl/conv_kernel_impl.h" namespace phi { + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + template void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& input, @@ -34,19 +58,16 @@ void DepthwiseConvKernel(const Context& dev_ctx, bool exhaustive_search, bool fuse_relu, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } template @@ -63,19 +84,16 @@ void Conv3DKernel(const Context& dev_ctx, int workspace_size_MB, bool exhaustive_search, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/conv_kernel.cu b/paddle/phi/kernels/gpu/conv_kernel.cu index e548b6fbf6d7b..a089175c96fb6 100644 --- a/paddle/phi/kernels/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_kernel.cu @@ -20,6 +20,29 @@ namespace phi { +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); +} + template void Conv3DKernel(const Context& dev_ctx, const DenseTensor& input, @@ -34,19 +57,16 @@ void Conv3DKernel(const Context& dev_ctx, int workspace_size_MB, bool exhaustive_search, DenseTensor* out) { - ConvKernel(dev_ctx, - input, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search, - out); + ConvKernelImpl(dev_ctx, + input, + filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + data_format, + out); } } // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index aa591a34a4399..f37e5d3a245b0 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -43,18 +43,20 @@ void ConvCudnnKernel(const Context& ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* output) { ctx.template Alloc(output); std::vector paddings = paddings_t; std::vector dilations = dilations_t; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool exhaustive_search_attr = + ctx.HasDnnAttr("exhaustive_search") + ? paddle::get(ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, @@ -406,12 +408,9 @@ void Conv3DCudnnKernel(const Context& dev_ctx, strides, paddings, padding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } @@ -436,12 +435,9 @@ void DepthwiseConvCudnnKernel(const Context& dev_ctx, strides, paddings, padding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h index 2ef2ed8af2809..eb2d183981213 100644 --- a/paddle/phi/kernels/impl/conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -25,19 +25,16 @@ namespace phi { template -void ConvKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter_t, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* output) { +void ConvKernelImpl(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter_t, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + DenseTensor* output) { std::vector paddings = paddings_t; std::vector dilations = dilations_t; DenseTensor filter = filter_t; diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index 287faf4cdb9aa..fca984a8606ec 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -27,12 +27,9 @@ void ConvKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* out) { using XPUT = typename XPUTypeTrait::Type; std::vector paddings = paddings_t; @@ -132,12 +129,9 @@ void DepthwiseConvKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, out); } diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 617c6e289bf2b..26f5cba4fee97 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -17,31 +17,15 @@ namespace phi { KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (!ctx.HasAttr("use_addto") || !ctx.HasAttr("workspace_size_MB") || - !ctx.HasAttr("exhaustive_search")) { - return KernelSignature("conv2d_infer", - {"Input", "Filter"}, - {"strides", - "paddings", - "padding_algorithm", - "groups", - "dilations", - "data_format"}, - {"Output"}); - } else { - return KernelSignature("conv2d", - {"Input", "Filter"}, - {"strides", - "paddings", - "padding_algorithm", - "groups", - "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, - {"Output"}); - } + return KernelSignature("conv2d", + {"Input", "Filter"}, + {"strides", + "paddings", + "padding_algorithm", + "dilations", + "groups", + "data_format"}, + {"Output"}); } KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { From 84c56fa349e6d5edd988e4d470221fd61f28d682 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 08:01:46 +0000 Subject: [PATCH 06/36] pass runtime attr into dev_ctx --- paddle/fluid/framework/operator.cc | 119 +++++++++++++++++-- paddle/fluid/operators/ops_extra_info.h | 11 +- paddle/phi/backends/gpu/gpu_context.cc | 7 +- paddle/phi/backends/gpu/gpu_context.h | 2 +- paddle/phi/backends/onednn/onednn_context.cc | 7 +- paddle/phi/backends/onednn/onednn_context.h | 2 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 5 +- 7 files changed, 131 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 83be1308bda86..d16901f9b906b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -2966,17 +2967,115 @@ void OperatorWithKernel::BuildPhiKernelContext( // For compatible with Op with extra attrs for specific backend auto& runtime_attrs = RuntimeAttrs(); - for (const auto& attr : runtime_attrs) { - auto& attr_name = attr.first; - // auto& attr_val = attr.second; - VLOG(4) << "attr name: " << attr_name; - if (phi::OneDNNContext::classof(dev_ctx)) { - VLOG(4) << "Current DeviceContext is OneDNNContext"; - } else if (phi::GPUContext::classof(dev_ctx)) { - // For GPUDNN, GPUDNN has no DeviceContext, so use GPUContext - VLOG(4) << "Current DeviceContext is GPUContext"; + for (const auto& attr_iter : runtime_attrs) { + auto& attr_name = attr_iter.first; + auto& attr = attr_iter.second; + auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name); + if (phi::OneDNNContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) { + VLOG(4) << "Runtime attr `" << attr_name + << "` is passed to OneDNNContext."; + phi::OneDNNContext* one_dnn_ctx = + static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; + case proto::AttrType::FLOAT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); + break; + case proto::AttrType::STRING: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::string, attr)); + break; + case proto::AttrType::INTS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOATS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::STRINGS: + one_dnn_ctx->SetDnnAttr( + attr_name, PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::BOOLEAN: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + case proto::AttrType::BOOLEANS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::LONG: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int64_t, attr)); + break; + case proto::AttrType::LONGS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOAT64S: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } + } else if (phi::GPUContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { + VLOG(4) << "Runtime attr `" << attr_name + << "` is passed to GPUDNNContext."; + phi::GPUContext* gpu_dnn_ctx = static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; + case proto::AttrType::FLOAT: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); + break; + case proto::AttrType::STRING: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::string, attr)); + break; + case proto::AttrType::INTS: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOATS: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::STRINGS: + gpu_dnn_ctx->SetDnnAttr( + attr_name, PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::BOOLEAN: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + case proto::AttrType::BOOLEANS: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::LONG: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int64_t, attr)); + break; + case proto::AttrType::LONGS: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOAT64S: + gpu_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } } else { - // skip current attr, do nothing + // do nothing } } VLOG(4) << "Done runtime attributes"; diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 2373b00c13c97..af6304375cedd 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -51,7 +51,7 @@ class ExtraAttrPropertySet final { inline uint64_t bitset() const { return bitset_; } - bool inline Contains(ExtraAttrProperty e) const { + bool inline Support(ExtraAttrProperty e) const { // DEPRECATED ExtraAttr always return false return static_cast(bitset_ & ExtraAttrPropertySet(e).bitset()); } @@ -129,6 +129,15 @@ const std::unordered_map ExtraAttrPropertySet(ExtraAttrProperty::GPUDNN)}, }; +inline ExtraAttrPropertySet GetExtraAttrPropertys( + const std::string& attr_name) { + auto iter = extra_attr_properties.find(attr_name); + if (iter != extra_attr_properties.end()) { + return iter->second; + } + return ExtraAttrPropertySet(); +} + template struct ExtraAttrChecker { ExtraAttrChecker(const std::string& attr_name, T default_value) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 3bd9f3e6dd481..def4e71f78f2b 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -729,7 +729,7 @@ struct GPUContext::Impl { return iter->second; } - void SetDnnAttr(const std::string& attr_name, const Attribute& attr) { + void SetDnnAttr(const std::string& attr_name, Attribute attr) { if (dnn_attrs_version_ == 0) { dnn_attrs_[attr_name] = attr; } else { @@ -1050,9 +1050,8 @@ const Attribute& GPUContext::GetDnnAttr(const std::string& attr_name) const { return impl_->GetDnnAttr(attr_name); } -void GPUContext::SetDnnAttr(const std::string& attr_name, - const Attribute& attr) { - return impl_->SetDnnAttr(attr_name, attr); +void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { + return impl_->SetDnnAttr(attr_name, std::move(attr)); } int8_t GPUContext::DnnAttrsVersion() const { return impl_->DnnAttrsVersion(); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index ca0e63b0f7974..9f5f8d6264e96 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -171,7 +171,7 @@ class PADDLE_API GPUContext : public DeviceContext, // Several methods for adapting Dnn-specific attributes bool HasDnnAttr(const std::string& attr_name) const; const Attribute& GetDnnAttr(const std::string& attr_name) const; - void SetDnnAttr(const std::string& attr_name, const Attribute& attr); + void SetDnnAttr(const std::string& attr_name, Attribute attr); int8_t DnnAttrsVersion() const; void SetDnnAttrsVersion(int8_t version); diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index f453b7da81807..975c4726ded44 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -297,7 +297,7 @@ struct OneDNNContext::Impl { return iter->second; } - void SetDnnAttr(const std::string& attr_name, const Attribute& attr) { + void SetDnnAttr(const std::string& attr_name, Attribute attr) { if (dnn_attrs_version_ == 0) { dnn_attrs_[attr_name] = attr; } else { @@ -373,9 +373,8 @@ const Attribute& OneDNNContext::GetDnnAttr(const std::string& attr_name) const { return impl_->GetDnnAttr(attr_name); } -void OneDNNContext::SetDnnAttr(const std::string& attr_name, - const Attribute& attr) { - return impl_->SetDnnAttr(attr_name, attr); +void OneDNNContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { + return impl_->SetDnnAttr(attr_name, std::move(attr)); } int8_t OneDNNContext::DnnAttrsVersion() const { diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index 8cb027eecd3b0..e077466ea725f 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -138,7 +138,7 @@ class OneDNNContext : public CPUContext { // Several methods for adapting ONEDNN-specific attributes bool HasDnnAttr(const std::string& attr_name) const; const Attribute& GetDnnAttr(const std::string& attr_name) const; - void SetDnnAttr(const std::string& attr_name, const Attribute& attr); + void SetDnnAttr(const std::string& attr_name, Attribute attr); int8_t DnnAttrsVersion() const; void SetDnnAttrsVersion(int8_t version); diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index f37e5d3a245b0..0fb968cb4eb5b 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -51,8 +51,11 @@ void ConvCudnnKernel(const Context& ctx, std::vector paddings = paddings_t; std::vector dilations = dilations_t; + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; bool exhaustive_search_attr = - ctx.HasDnnAttr("exhaustive_search") + has_exhaustive_search ? paddle::get(ctx.GetDnnAttr("exhaustive_search")) : false; bool exhaustive_search = From fc673723888fcdd5c750ca2c1d950350d3639f66 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 13:27:26 +0000 Subject: [PATCH 07/36] fix marco error --- paddle/fluid/framework/operator.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d16901f9b906b..5b5d57101266c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2966,11 +2966,13 @@ void OperatorWithKernel::BuildPhiKernelContext( VLOG(4) << "Done attributes"; // For compatible with Op with extra attrs for specific backend +#if defined(PADDLE_WITH_MKLDNN) || defined(PADDLE_WITH_CUDA) auto& runtime_attrs = RuntimeAttrs(); for (const auto& attr_iter : runtime_attrs) { auto& attr_name = attr_iter.first; auto& attr = attr_iter.second; auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name); +#ifdef PADDLE_WITH_MKLDNN if (phi::OneDNNContext::classof(dev_ctx) && attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) { VLOG(4) << "Runtime attr `" << attr_name @@ -3023,8 +3025,11 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported Attribute value type `%s` for phi.", platform::demangle(attr.type().name()))); } - } else if (phi::GPUContext::classof(dev_ctx) && - attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { + } +#endif +#ifdef PADDLE_WITH_CUDA + if (phi::GPUContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { VLOG(4) << "Runtime attr `" << attr_name << "` is passed to GPUDNNContext."; phi::GPUContext* gpu_dnn_ctx = static_cast(dev_ctx); @@ -3074,11 +3079,11 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported Attribute value type `%s` for phi.", platform::demangle(attr.type().name()))); } - } else { - // do nothing } +#endif } VLOG(4) << "Done runtime attributes"; +#endif } } // namespace framework From ed7b59ff3752297365ff1c0f79d895bc357453ab Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 28 Sep 2022 03:18:34 +0000 Subject: [PATCH 08/36] clear conv_grad_kernel extra args --- paddle/fluid/platform/enforce.h | 133 +++++++++--------- paddle/fluid/platform/enforce_test.cc | 7 +- paddle/phi/core/enforce.h | 132 +++++++++-------- paddle/phi/kernels/conv_grad_kernel.h | 5 +- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 10 +- paddle/phi/kernels/gpu/conv_grad_kernel.cu | 5 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 31 ++-- paddle/phi/kernels/gpudnn/conv_kernel.cu | 2 +- .../phi/kernels/impl/conv_grad_kernel_impl.h | 5 +- paddle/phi/kernels/xpu/conv_grad_kernel.cc | 10 +- paddle/phi/ops/compat/conv2d_sig.cc | 7 +- 11 files changed, 163 insertions(+), 184 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3cad2e3d2055e..5bd9029179ffe 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -122,77 +122,80 @@ using namespace ::phi::enforce; // NOLINT #endif /* - * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get - * safely. paddle::get is not a completely safe api, although it will not - * go wrong in most cases, but in extreme cases, it may fail and directly - * throw a paddle::bad_variant_access const exception, without any stack - *information. - * This kind of problems is difficult to debug, so add these macros to - * enrich paddle::get error information. At the same time, we restrict - * the direct use of paddle::get by CI rule. + * Summary: This macro is used to get Variable or internal type + * data (such as LoDTensor or SelectedRows) of the Input and + * Output in op, generally used when call scope.FindVar(Input/ + * Output("Name")) or ctx.Input(). + * Firstly this macro check whether the obtained pointer is null, + * and then return data if it is not null. + * + * Note: This macro is only suitable for specific scenarios and + * does not intended to be widely used. If it cannot meet the + * requirements, please use other PADDLE_ENFORCE** check macro. * * Parameters: - *     __TYPE: the target variable type - * __VALUE: the target variable to get + *     __PTR: pointer + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type * - * Examples: - * - unsafe writing: int x = paddle::get(y); - * - safe writing: int x = PADDLE_GET(int, y); + * Return: The data pointed to by the pointer. * - * Note: GCC 4.8 cannot select right overloaded function here, so need - * to define different functions and macros here, after we upgreade - * CI gcc version, we can only define one PADDLE_GET macro. + * Examples: + * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); */ -namespace details { - -using namespace phi::enforce::details; // NOLINT - -#define DEFINE_SAFE_PADDLE_GET( \ - __InputType, __OutputType, __OutputTypePtr, __FuncName) \ - template \ - auto __FuncName( \ - __InputType input, const char* expression, const char* file, int line) \ - ->typename std::conditional::value, \ - __OutputTypePtr, \ - __OutputType>::type { \ - try { \ - return paddle::get(input); \ - } catch (paddle::bad_variant_access const&) { \ - HANDLE_THE_ERROR \ - throw ::phi::enforce::EnforceNotMet( \ - phi::errors::InvalidArgument( \ - "paddle::get failed, cannot get value " \ - "(%s) by type %s, its type is %s.", \ - expression, \ - phi::enforce::demangle(typeid(OutputType).name()), \ - phi::enforce::demangle(input.type().name())), \ - file, \ - line); \ - END_HANDLE_THE_ERROR \ - } \ - } - -DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet); -DEFINE_SAFE_PADDLE_GET(const InputType&, - const OutputType&, - const OutputType*, - SafeBoostGetConst); -DEFINE_SAFE_PADDLE_GET(InputType&&, - OutputType, - OutputType*, - SafeBoostGetMutable); - -} // namespace details +#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ + (([&]() -> std::add_lvalue_reference::type { \ + auto* __ptr = (__PTR); \ + if (UNLIKELY(nullptr == __ptr)) { \ + auto __summary__ = phi::errors::NotFound( \ + "Unable to get %s data of %s %s in operator %s. " \ + "Possible reasons are:\n" \ + " 1. The %s is not the %s of operator %s;\n" \ + " 2. The %s has no corresponding variable passed in;\n" \ + " 3. The %s corresponding variable is not initialized.", \ + phi::demangle( \ + typeid(std::add_lvalue_reference::type) \ + .name()), \ + __ROLE, \ + __NAME, \ + __OP_TYPE, \ + __NAME, \ + __ROLE, \ + __OP_TYPE, \ + __NAME, \ + __NAME); \ + auto __message__ = ::paddle::string::Sprintf( \ + "%s\n [Hint: pointer " #__PTR " should not be null.]", \ + __summary__.error_message()); \ + __THROW_ERROR_INTERNAL__( \ + phi::ErrorSummary(__summary__.code(), __message__)); \ + } \ + return *__ptr; \ + })()) -#define PADDLE_GET(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGet<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_CONST(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGetConst<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \ - paddle::platform::details::SafeBoostGetMutable<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) +/* + * Summary: This macro is used to check whether op has specified + * Input or Output Variables. Because op's Input and Output + * checking are written similarly, so abstract this macro. + * + * Parameters: + *     __EXPR: (bool), the bool expression + * __ROLE: (string), Input or Output + * __NAME: (string), Input or Output name + * __OP_TYPE: (string), the op type + * + * Examples: + * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + */ +#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ + do { \ + PADDLE_ENFORCE_EQ( \ + __EXPR, \ + true, \ + phi::errors::NotFound( \ + "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ + } while (0) /** OTHER EXCEPTION AND ENFORCE **/ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 48aece1c4170b..9fc200ca82f1c 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -528,10 +528,9 @@ struct CannotToStringType { }; TEST(enforce, cannot_to_string_type) { - static_assert( - !paddle::platform::details::CanToString::kValue, - "CannotToStringType must not be converted to string"); - static_assert(paddle::platform::details::CanToString::kValue, + static_assert(!phi::enforce::details::CanToString::kValue, + "CannotToStringType must not be converted to string"); + static_assert(phi::enforce::details::CanToString::kValue, "int can be converted to string"); CannotToStringType obj1(3), obj2(4), obj3(3); diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 251916d8c1a15..cfe8b47ef9afa 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -43,6 +43,7 @@ limitations under the License. */ #include "paddle/phi/core/errors.h" #include "paddle/utils/string/printf.h" #include "paddle/utils/string/to_string.h" +#include "paddle/utils/variant.h" DECLARE_int32(call_stack_level); @@ -409,80 +410,75 @@ struct EnforceNotMet : public std::exception { /** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ /* - * Summary: This macro is used to get Variable or internal type - * data (such as LoDTensor or SelectedRows) of the Input and - * Output in op, generally used when call scope.FindVar(Input/ - * Output("Name")) or ctx.Input(). - * Firstly this macro check whether the obtained pointer is null, - * and then return data if it is not null. - * - * Note: This macro is only suitable for specific scenarios and - * does not intended to be widely used. If it cannot meet the - * requirements, please use other PADDLE_ENFORCE** check macro. + * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get + * safely. paddle::get is not a completely safe api, although it will not + * go wrong in most cases, but in extreme cases, it may fail and directly + * throw a paddle::bad_variant_access const exception, without any stack + *information. + * This kind of problems is difficult to debug, so add these macros to + * enrich paddle::get error information. At the same time, we restrict + * the direct use of paddle::get by CI rule. * * Parameters: - *     __PTR: pointer - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - * - * Return: The data pointed to by the pointer. + *     __TYPE: the target variable type + * __VALUE: the target variable to get * * Examples: - * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); - */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - auto __summary__ = phi::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. The %s corresponding variable is not initialized.", \ - phi::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, \ - __NAME, \ - __OP_TYPE, \ - __NAME, \ - __ROLE, \ - __OP_TYPE, \ - __NAME, \ - __NAME); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - phi::ErrorSummary(__summary__.code(), __message__)); \ - } \ - return *__ptr; \ - })()) - -/* - * Summary: This macro is used to check whether op has specified - * Input or Output Variables. Because op's Input and Output - * checking are written similarly, so abstract this macro. - * - * Parameters: - *     __EXPR: (bool), the bool expression - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type + * - unsafe writing: int x = paddle::get(y); + * - safe writing: int x = PADDLE_GET(int, y); * - * Examples: - * OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul"); + * Note: GCC 4.8 cannot select right overloaded function here, so need + * to define different functions and macros here, after we upgrade + * CI gcc version, we can only define one PADDLE_GET macro. */ -#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \ - do { \ - PADDLE_ENFORCE_EQ( \ - __EXPR, \ - true, \ - phi::errors::NotFound( \ - "No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \ - } while (0) +namespace details { + +#define DEFINE_SAFE_PADDLE_GET( \ + __InputType, __OutputType, __OutputTypePtr, __FuncName) \ + template \ + auto __FuncName( \ + __InputType input, const char* expression, const char* file, int line) \ + ->typename std::conditional::value, \ + __OutputTypePtr, \ + __OutputType>::type { \ + try { \ + return paddle::get(input); \ + } catch (paddle::bad_variant_access const&) { \ + HANDLE_THE_ERROR \ + throw ::phi::enforce::EnforceNotMet( \ + phi::errors::InvalidArgument( \ + "paddle::get failed, cannot get value " \ + "(%s) by type %s, its type is %s.", \ + expression, \ + phi::enforce::demangle(typeid(OutputType).name()), \ + phi::enforce::demangle(input.type().name())), \ + file, \ + line); \ + END_HANDLE_THE_ERROR \ + } \ + } + +DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet); +DEFINE_SAFE_PADDLE_GET(const InputType&, + const OutputType&, + const OutputType*, + SafeBoostGetConst); +DEFINE_SAFE_PADDLE_GET(InputType&&, + OutputType, + OutputType*, + SafeBoostGetMutable); + +} // namespace details + +#define PADDLE_GET(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGet<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) +#define PADDLE_GET_CONST(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGetConst<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) +#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \ + phi::enforce::details::SafeBoostGetMutable<__TYPE>( \ + __VALUE, #__VALUE, __FILE__, __LINE__) } // namespace enforce using namespace enforce; // NOLINT diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index a6b970e0996be..00b8bfcf48ee3 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -26,12 +26,9 @@ void ConvGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, - int groups, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad); diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 880837dd7cd61..7b6c3ee8bd550 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -44,12 +44,9 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } @@ -77,12 +74,9 @@ void Conv3DGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu index e4b094314f8c1..6f535cb8d432d 100644 --- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -43,12 +43,9 @@ void Conv3DGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index bc7a8b4f37840..59cddfd31095e 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -44,12 +44,9 @@ void ConvCudnnGradKernel(const Context& ctx, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* input_grad, DenseTensor* filter_grad) { if (input_grad) { @@ -59,11 +56,25 @@ void ConvCudnnGradKernel(const Context& ctx, ctx.template Alloc(filter_grad); } + bool has_use_addto = ctx.HasDnnAttr("use_addto"); + VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; + bool use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("use_addto")) + : false; + std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, @@ -616,12 +627,9 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } @@ -650,12 +658,9 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 0fb968cb4eb5b..69466419a84ef 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -56,7 +56,7 @@ void ConvCudnnKernel(const Context& ctx, << has_exhaustive_search; bool exhaustive_search_attr = has_exhaustive_search - ? paddle::get(ctx.GetDnnAttr("exhaustive_search")) + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index 6674500c3c2e5..5b96548476e7f 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -32,12 +32,9 @@ void ConvGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { // The filter and filter_grad will be reshaped in the calculations, diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index da86474b96b65..778ceb4773c30 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -28,12 +28,9 @@ void ConvGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad) { using XPUT = typename XPUTypeTrait::Type; @@ -168,12 +165,9 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, input_grad, filter_grad); } diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 26f5cba4fee97..57482eb1bb17e 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -34,12 +34,9 @@ KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { {"strides", "paddings", "padding_algorithm", - "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "groups", + "data_format"}, {"Input@GRAD", "Filter@GRAD"}); } From 23cab0737e539ca184961b2fd144633c21af06a5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 28 Sep 2022 09:33:00 +0000 Subject: [PATCH 09/36] merge conv_grad_grad into conv_grad --- paddle/phi/kernels/conv_grad_grad_kernel.h | 61 -- paddle/phi/kernels/conv_grad_kernel.h | 40 + .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 72 -- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 50 ++ .../phi/kernels/gpu/conv_grad_grad_kernel.cu | 23 - paddle/phi/kernels/gpu/conv_grad_kernel.cu | 4 + .../kernels/gpudnn/conv_grad_grad_kernel.cu | 834 ------------------ paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 783 ++++++++++++++++ .../kernels/impl/conv_grad_grad_kernel_impl.h | 330 ------- .../phi/kernels/impl/conv_grad_kernel_impl.h | 304 ++++++- 10 files changed, 1180 insertions(+), 1321 deletions(-) delete mode 100644 paddle/phi/kernels/conv_grad_grad_kernel.h delete mode 100644 paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc delete mode 100644 paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu delete mode 100644 paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu delete mode 100644 paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h deleted file mode 100644 index f25cbe384c213..0000000000000 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void ConvGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad); - -template -void Conv3DGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad); - -} // namespace phi diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index 00b8bfcf48ee3..759a49d85f486 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -67,4 +67,44 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, DenseTensor* input_grad, DenseTensor* filter_grad); +template +void ConvGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); + +template +void Conv3DGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc deleted file mode 100644 index 3289c8f5c84d6..0000000000000 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" - -namespace phi { -template -void Conv3DGradGradKernel(const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { -} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - CPU, - ALL_LAYOUT, - phi::Conv3DGradGradKernel, - float, - double) {} diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 7b6c3ee8bd550..3becf810d7b34 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -81,6 +81,45 @@ void Conv3DGradKernel(const Context& dev_ctx, filter_grad); } +template +void Conv3DGradGradKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + input_grad, + filter_grad, + out_grad_grad); +} + } // namespace phi PD_REGISTER_KERNEL( @@ -95,3 +134,14 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad, PD_REGISTER_KERNEL( conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + CPU, + ALL_LAYOUT, + phi::Conv3DGradGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu deleted file mode 100644 index 9d3e6da944a19..0000000000000 --- a/paddle/phi/kernels/gpu/conv_grad_grad_kernel.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" - -PD_REGISTER_KERNEL( - conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { -} diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu index 6f535cb8d432d..dea55d34413bd 100644 --- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -57,3 +57,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu deleted file mode 100644 index fb9580427e1f4..0000000000000 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ /dev/null @@ -1,834 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif - -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" - -namespace phi { - -template -void ConvCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - auto X = &input; - auto W = &filter; - auto dO = &out_grad; - auto ddX = input_grad_grad.get_ptr(); - auto ddW = filter_grad_grad.get_ptr(); - - auto ddO = out_grad_grad; - auto dW = filter_grad; - auto dX = input_grad; - if (ddO) { - ctx.template Alloc(ddO); - phi::funcs::SetConstant set_zero; - set_zero(ctx, ddO, static_cast(0)); - } - if (dW) { - ctx.template Alloc(dW); - } - if (dX) { - ctx.template Alloc(dX); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - std::vector dilations = dilations_t; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, - false, - phi::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = paddings_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - DenseTensor transformed_X_channel(X->type()); - DenseTensor transformed_dO_channel(dO->type()); - DenseTensor transformed_ddX_channel(X->type()); - - DenseTensor transformed_ddO_channel(dO->type()); - DenseTensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X_channel); - TransToChannelFirst(ctx, X, &transformed_X_channel); - - ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); - TransToChannelFirst(ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); - ctx.template Alloc(&transformed_dX_channel); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); - DenseTensor transformed_X(X->type()); - DenseTensor transformed_ddX(X->type()); - - DenseTensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - DDim new_input_shape(make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - ctx.template Alloc(&transformed_X); - - if (ddX) { - ctx.template Alloc(&transformed_ddX); - } - if (dX) { - ctx.template Alloc(&transformed_dX); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - case 5: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = paddle::platform::CudnnDataType::type; - - auto handle = ctx.cudnn_handle(); - auto layout = paddle::platform::GetCudnnTensorFormat( - paddle::platform::DataLayout::kNCHW); - - paddle::operators::ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args2{&transformed_X, - ddW, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - paddle::operators::ConvArgs args4{&transformed_dX, - ddW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - -#ifdef PADDLE_WITH_HIP - paddle::operators::SearchResult fwd_result1; - paddle::operators::SearchResult fwd_result2; - paddle::operators::SearchResult data_result; - paddle::operators::SearchResult - filter_result; -#else - paddle::operators::SearchResult fwd_result1; - paddle::operators::SearchResult fwd_result2; - paddle::operators::SearchResult data_result; - paddle::operators::SearchResult - filter_result; -#endif - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = - paddle::operators::SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_result1.algo = search1::Find( - args1, exhaustive_search, false, workspace_size, ctx); -#else - using search1 = - paddle::operators::SearchAlgorithm; - fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = - paddle::operators::SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_result2.algo = search2::Find( - args2, exhaustive_search, false, workspace_size, ctx); -#else - using search2 = - paddle::operators::SearchAlgorithm; - fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max( - workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = - paddle::operators::SearchAlgorithm; - workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_result.algo = search3::Find( - args3, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search3 = - paddle::operators::SearchAlgorithm; - filter_result = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max( - workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = - paddle::operators::SearchAlgorithm; - workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_result.algo = search4::Find( - args4, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search4 = - paddle::operators::SearchAlgorithm; - data_result = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = std::max( - workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW( - transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), - DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - paddle::operators::ScalingParamType alpha = 1.0f; - paddle::operators::ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx, - args1.wdesc.desc(), - w, - args1.cdesc.desc(), - fwd_result1.algo, - &beta, - args1.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx + i * group_offset_in, - args1.wdesc.desc(), - w + i * group_offset_filter, - args1.cdesc.desc(), - fwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x, - args2.wdesc.desc(), - ddw, - args2.cdesc.desc(), - fwd_result2.algo, - &beta, - args2.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x + i * group_offset_in, - args2.wdesc.desc(), - ddw + i * group_offset_filter, - args2.cdesc.desc(), - fwd_result2.algo, - workspace_ptr, - workspace_size, - &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardWeights( - handle, - &alpha, - args3.odesc.desc(), - transformed_dy_channel, - args3.idesc.desc(), - ddx, - args3.cdesc.desc(), - filter_result.algo, - &beta, - args3.wdesc.desc(), - dw, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - ddx + i * group_offset_in, - args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args4.odesc.desc(), - transformed_dy_channel, - args4.wdesc.desc(), - ddw, - args4.cdesc.desc(), - data_result.algo, - &beta, - args4.idesc.desc(), - transformed_dx, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args4.wdesc.desc(), - ddw + i * group_offset_filter, - args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), - data_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - paddle::operators::RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - paddle::operators::RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX_channel, dX); - } - } -} - -template -void DepthwiseConvDoubleGradGPUDNNKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - bool fuse_relu, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -template -void Conv3DCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -} // namespace phi - -#ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - phi::dtype::float16) {} -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -#else - -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16) {} - -#endif - -#endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 59cddfd31095e..bf870dc2eaf99 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -665,6 +665,720 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, filter_grad); } +template +void ConvCudnnGradGradKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + auto X = &input; + auto W = &filter; + auto dO = &out_grad; + auto ddX = input_grad_grad.get_ptr(); + auto ddW = filter_grad_grad.get_ptr(); + + auto ddO = out_grad_grad; + auto dW = filter_grad; + auto dX = input_grad; + if (ddO) { + ctx.template Alloc(ddO); + phi::funcs::SetConstant set_zero; + set_zero(ctx, ddO, static_cast(0)); + } + if (dW) { + ctx.template Alloc(dW); + } + if (dX) { + ctx.template Alloc(dX); + } + + // const T* x = X->data(); + const T* dy = dO->data(); + const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + T* transformed_dx = nullptr; + std::vector dilations = dilations_t; + + bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool deterministic = FLAGS_cudnn_deterministic; + auto exhaustive_deterministic = exhaustive_search && deterministic; + PADDLE_ENFORCE_EQ(exhaustive_deterministic, + false, + phi::errors::InvalidArgument( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + + std::vector paddings = paddings_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensors to channel first----------- + DenseTensor transformed_X_channel(X->type()); + DenseTensor transformed_dO_channel(dO->type()); + DenseTensor transformed_ddX_channel(X->type()); + + DenseTensor transformed_ddO_channel(dO->type()); + DenseTensor transformed_dX_channel(X->type()); + + if (channel_last) { + ResizeToChannelFirst(ctx, X, &transformed_X_channel); + TransToChannelFirst(ctx, X, &transformed_X_channel); + + ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); + TransToChannelFirst(ctx, dO, &transformed_dO_channel); + + if (ddX) { + ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); + TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); + } + + if (ddO) { + ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); + } + if (dX) { + ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); + ctx.template Alloc(&transformed_dX_channel); + } + + } else { + transformed_X_channel = *X; + transformed_dO_channel = *dO; + if (ddX) { + transformed_ddX_channel = *ddX; + } + if (ddO) { + transformed_ddO_channel.ShareDataWith(*ddO); + } + if (dX) { + transformed_dX_channel.ShareDataWith(*dX); + } + } + + auto in_dims = transformed_X_channel.dims(); + auto filter_dims = W->dims(); + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); + DenseTensor transformed_X(X->type()); + DenseTensor transformed_ddX(X->type()); + + DenseTensor transformed_dX(X->type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(X->dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + new_input_shape_vec[0] = transformed_X_channel.dims()[0]; + new_input_shape_vec[1] = transformed_X_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); + padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_X_channel.dims()[i + 2] + padding_diff[i]; + input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; + } + DDim new_input_shape(make_ddim(new_input_shape_vec)); + transformed_X.Resize(new_input_shape); + transformed_ddX.Resize(new_input_shape); + transformed_dX.Resize(new_input_shape); + + ctx.template Alloc(&transformed_X); + + if (ddX) { + ctx.template Alloc(&transformed_ddX); + } + if (dX) { + ctx.template Alloc(&transformed_dX); + } + + // pad for input + const int rank = X->dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + case 5: { + funcs::PadFunction( + ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + if (ddX) { + funcs::PadFunction(ctx, + input_pad, + transformed_ddX_channel, + pad_value, + &transformed_ddX); + } + } break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + + } else { + transformed_X.ShareDataWith(transformed_X_channel); + if (ddX) { + transformed_ddX.ShareDataWith(transformed_ddX_channel); + } + if (dX) { + transformed_dX.ShareDataWith(transformed_dX_channel); + } + + if (paddings.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings[2 * i]; + } + } + } + + const T* x = transformed_X.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = paddle::platform::CudnnDataType::type; + + auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); + + paddle::operators::ConvArgs args1{&transformed_ddX, + W, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + paddle::operators::ConvArgs args2{&transformed_X, + ddW, + &transformed_ddO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + paddle::operators::ConvArgs args3{&transformed_ddX, + dW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + paddle::operators::ConvArgs args4{&transformed_dX, + ddW, + &transformed_dO_channel, + strides, + padding_common, + dilations, + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; + +#ifdef PADDLE_WITH_HIP + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; +#else + paddle::operators::SearchResult fwd_result1; + paddle::operators::SearchResult fwd_result2; + paddle::operators::SearchResult data_result; + paddle::operators::SearchResult + filter_result; +#endif + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + + T* transformed_ddy_channel = nullptr; + if (ddO) { + ddy = ddO->data(); + transformed_ddy_channel = transformed_ddO_channel.data(); + if (ddX) { + args1.handle = handle; + args1.idesc.set(transformed_ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(transformed_ddO_channel, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search1 = + paddle::operators::SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_result1.algo = search1::Find( + args1, exhaustive_search, false, workspace_size, ctx); +#else + using search1 = + paddle::operators::SearchAlgorithm; + fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); +#endif + } + + if (ddW) { + ddw = ddW->data(); + args2.handle = handle; + args2.idesc.set(transformed_X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(transformed_ddO_channel, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search2 = + paddle::operators::SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_result2.algo = search2::Find( + args2, exhaustive_search, false, workspace_size, ctx); +#else + using search2 = + paddle::operators::SearchAlgorithm; + fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); +#endif + } + } + + if (dW && ddX) { + dw = dW->data(); + args3.handle = handle; + args3.idesc.set(transformed_ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(transformed_dO_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search3 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = search3::Find( + args3, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search3 = + paddle::operators::SearchAlgorithm; + filter_result = + search3::Find(args3, exhaustive_search, deterministic, ctx); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (ddW && dX) { + transformed_dx = transformed_dX.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(transformed_dO_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_group); + +#ifdef PADDLE_WITH_HIP + using search4 = + paddle::operators::SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_result.algo = search4::Find( + args4, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search4 = + paddle::operators::SearchAlgorithm; + data_result = + search4::Find(args4, exhaustive_search, deterministic, ctx); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW( + transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dO_channel.dims(), + DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + paddle::operators::ScalingParamType alpha = 1.0f; + paddle::operators::ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = ctx.cudnn_workspace_handle(); + + if (ddO) { + if (ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_result1.algo, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx + i * group_offset_in, + args1.wdesc.desc(), + w + i * group_offset_filter, + args1.cdesc.desc(), + fwd_result1.algo, + workspace_ptr, + workspace_size, + &beta, + args1.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (ddW) { +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_result2.algo, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x + i * group_offset_in, + args2.wdesc.desc(), + ddw + i * group_offset_filter, + args2.cdesc.desc(), + fwd_result2.algo, + workspace_ptr, + workspace_size, + &alpha, + args2.odesc.desc(), + transformed_ddy_channel + i * group_offset_out)); + }, + workspace_size); + } +#endif + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_ddO_channel, ddO); + } + } + T* transformed_dy_channel = transformed_dO_channel.data(); + if (dW && ddX) { + ddx = transformed_ddX.data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + transformed_dy_channel, + args3.idesc.desc(), + ddx, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dw, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardFilter( + handle, + &alpha, + args3.idesc.desc(), + ddx + i * group_offset_in, + args3.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + workspace_ptr, + workspace_size, + &beta, + args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } +#endif + } + + if (dX && ddW) { + ddw = ddW->data(); +#ifdef PADDLE_WITH_HIP + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args4.odesc.desc(), + transformed_dy_channel, + args4.wdesc.desc(), + ddw, + args4.cdesc.desc(), + data_result.algo, + &beta, + args4.idesc.desc(), + transformed_dx, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnConvolutionBackwardData( + handle, + &alpha, + args4.wdesc.desc(), + ddw + i * group_offset_filter, + args4.odesc.desc(), + transformed_dy_channel + i * group_offset_out, + args4.cdesc.desc(), + data_result.algo, + workspace_ptr, + workspace_size, + &beta, + args4.idesc.desc(), + transformed_dx + i * group_offset_in)); + }, + workspace_size); + } +#endif + + if (!is_sys_pad) { + // reverse padded input + std::vector starts(X->dims().size(), 0); + std::vector axes(X->dims().size(), 0); + + for (size_t i = 0; i < X->dims().size(); ++i) { + starts[i] = input_pad[2 * i]; + axes[i] = i; + } + if (X->dims().size() == 4) { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } else { + paddle::operators::RemovePaddingSlice( + ctx, &transformed_dX, &transformed_dX_channel, starts, axes); + } + } + if (channel_last) { + TransToChannelLast(ctx, &transformed_dX_channel, dX); + } + } +} + +template +void DepthwiseConvDoubleGradGPUDNNKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + bool fuse_relu, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvCudnnGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + input_grad, + filter_grad, + out_grad_grad); +} + +template +void Conv3DCudnnGradGradKernel( + const Context& ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search_t, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + ConvCudnnGradGradKernel(ctx, + input, + filter, + out_grad, + input_grad_grad, + filter_grad_grad, + strides, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search_t, + input_grad, + filter_grad, + out_grad_grad); +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -688,6 +1402,26 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad, phi::DepthwiseConvCudnnGradKernel, float, phi::dtype::float16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + phi::dtype::float16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, @@ -707,6 +1441,32 @@ PD_REGISTER_KERNEL(conv3d_grad, double, phi::dtype::float16, phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(conv2d_grad, GPUDNN, @@ -724,6 +1484,29 @@ PD_REGISTER_KERNEL(conv3d_grad, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL(conv2d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(conv3d_grad_grad, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnGradGradKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, + GPU, + ALL_LAYOUT, + phi::DepthwiseConvDoubleGradGPUDNNKernel, + float, + double, + phi::dtype::float16) {} #endif #endif diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h deleted file mode 100644 index 512b1529f9191..0000000000000 --- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -void ConvGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides_t, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - const DenseTensor* X = &input; - const DenseTensor* dY = &out_grad; - const DenseTensor* ddX = input_grad_grad.get_ptr(); - const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); - - DenseTensor* ddY = out_grad_grad; - DenseTensor* dW = filter_grad; - DenseTensor* dX = input_grad; - DenseTensor W = filter; - - if (!ddY && !dW && !dX) return; - - const std::vector strides = strides_t; - std::vector paddings = paddings_t; - std::vector dilations = dilations_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - DenseTensor transformed_X(X->type()); - DenseTensor transformed_dY(dY->type()); - DenseTensor transformed_ddX(X->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, X, &transformed_X); - TransToChannelFirst(dev_ctx, X, &transformed_X); - - ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); - TransToChannelFirst(dev_ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); - TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(vectorize(W.dims())); - std::vector output_shape_vec(vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - DDim col_shape(make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - DDim input_shape = - slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - DenseTensor col; - DenseTensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dev_ctx.template Alloc(dX); - - DenseTensor transformed_dX(dX->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - paddle::operators::math::Col2VolFunctor col2vol; - paddle::operators::math:: - Col2ImFunctor - col2im; - - for (int i = 0; i < batch_size; i++) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul( - ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, - col, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dev_ctx.template Alloc(dW); - set_zero(dev_ctx, dW, static_cast(0)); - DenseTensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - paddle::operators::math:: - Im2ColFunctor - im2col; - paddle::operators::math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - dev_ctx.template Alloc(ddY); - - DenseTensor transformed_ddY(ddY->type()); - if (channel_last) { - ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - paddle::operators::math:: - Im2ColFunctor - im2col; - paddle::operators::math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor ddy_batch = - transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - DenseTensor ddy_slice = - ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - DenseTensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - DenseTensor ddx_slice = - ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); - } - - if (ddW_in) { - DenseTensor x_batch = - transformed_X.Slice(i, i + 1).Resize(input_shape); - DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - DenseTensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - x_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_ddY, ddY); - } - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index 5b96548476e7f..9929ee908df5d 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/phi/kernels/conv_grad_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -251,4 +250,307 @@ void ConvGradKernel(const Context& dev_ctx, } } +template +void ConvGradGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, + const std::vector& strides_t, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + DenseTensor* input_grad, + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { + const DenseTensor* X = &input; + const DenseTensor* dY = &out_grad; + const DenseTensor* ddX = input_grad_grad.get_ptr(); + const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); + + DenseTensor* ddY = out_grad_grad; + DenseTensor* dW = filter_grad; + DenseTensor* dX = input_grad; + DenseTensor W = filter; + + if (!ddY && !dW && !dX) return; + + const std::vector strides = strides_t; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform Tensor + DenseTensor transformed_X(X->type()); + DenseTensor transformed_dY(dY->type()); + DenseTensor transformed_ddX(X->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, X, &transformed_X); + TransToChannelFirst(dev_ctx, X, &transformed_X); + + ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); + TransToChannelFirst(dev_ctx, dY, &transformed_dY); + + if (ddX) { + ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); + TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); + } + } else { + transformed_X = *X; + transformed_dY = *dY; + if (ddX) { + transformed_ddX = *ddX; + } + } + + // update padding and dilation + auto in_dims = transformed_X.dims(); + auto filter_dims = W.dims(); + + DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + const int batch_size = static_cast(transformed_X.dims()[0]); + std::vector filter_shape_vec(vectorize(W.dims())); + std::vector output_shape_vec(vectorize(transformed_dY.dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = transformed_X.dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + DDim col_shape(make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + DDim input_shape = + slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] + DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + DDim output_matrix_shape = { + transformed_dY.dims()[1], + transformed_dY.numel() / + (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; + int in_step = static_cast(transformed_X.dims()[1]) / groups; + int out_step = static_cast(transformed_dY.dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + DenseTensor col; + DenseTensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + dev_ctx.template Alloc(&col); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + phi::funcs::SetConstant set_zero; + auto blas = phi::funcs::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + dev_ctx.template Alloc(dX); + + DenseTensor transformed_dX(dX->type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); + + } else { + transformed_dX = *dX; + } + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, &transformed_dX, static_cast(0)); + } + paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math:: + Col2ImFunctor + col2im; + + for (int i = 0; i < batch_size; i++) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul( + ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, + col, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dX, dX); + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW && ddX) { + dev_ctx.template Alloc(dW); + set_zero(dev_ctx, dW, static_cast(0)); + DenseTensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor dy_batch = + transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; ++g) { + // im2col + DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + dev_ctx.template Alloc(ddY); + + DenseTensor transformed_ddY(ddY->type()); + if (channel_last) { + ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); + } else { + transformed_ddY = *ddY; + } + + set_zero(dev_ctx, &transformed_ddY, static_cast(0)); + paddle::operators::math:: + Im2ColFunctor + im2col; + paddle::operators::math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + DenseTensor ddy_batch = + transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + // gemm + DenseTensor ddy_slice = + ddy_batch.Slice(g * out_step, (g + 1) * out_step); + + if (ddX) { + DenseTensor ddx_batch = + transformed_ddX.Slice(i, i + 1).Resize(input_shape); + DenseTensor ddx_slice = + ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + ddx_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); + } + + if (ddW_in) { + DenseTensor x_batch = + transformed_X.Slice(i, i + 1).Resize(input_shape); + DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + + DenseTensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, + x_slice, + dilations, + strides, + std::vector{ + paddings[0], paddings[2], paddings[1], paddings[3]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); + } + + // gemm + DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul( + ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); + } + } + } + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_ddY, ddY); + } + } +} + } // namespace phi From 123edd23f89a45ef3ad9efefbdfb9db86e57089a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 28 Sep 2022 12:35:31 +0000 Subject: [PATCH 10/36] clear conv2d_grad_grad extra attrs --- paddle/phi/kernels/conv_grad_kernel.h | 5 +--- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 5 +--- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 25 +++++++++---------- .../phi/kernels/impl/conv_grad_kernel_impl.h | 5 +--- paddle/phi/ops/compat/conv2d_sig.cc | 7 ++---- 5 files changed, 17 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index 759a49d85f486..21dbc7f8761d1 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -77,12 +77,9 @@ void ConvGradGradKernel(const Context& dev_ctx, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, - int groups, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad, DenseTensor* out_grad_grad); diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 3becf810d7b34..7102af6cf4a20 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -109,12 +109,9 @@ void Conv3DGradGradKernel(const Context& ctx, strides, paddings_t, padding_algorithm, - groups, dilations_t, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, input_grad, filter_grad, out_grad_grad); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index bf870dc2eaf99..e67815a5b7dd6 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -676,12 +676,9 @@ void ConvCudnnGradGradKernel( const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, DenseTensor* input_grad, DenseTensor* filter_grad, DenseTensor* out_grad_grad) { @@ -717,7 +714,15 @@ void ConvCudnnGradGradKernel( T* transformed_dx = nullptr; std::vector dilations = dilations_t; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; + bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, @@ -1328,12 +1333,9 @@ void DepthwiseConvDoubleGradGPUDNNKernel( strides, paddings_t, padding_algorithm, - groups, dilations_t, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, input_grad, filter_grad, out_grad_grad); @@ -1368,12 +1370,9 @@ void Conv3DCudnnGradGradKernel( strides, paddings_t, padding_algorithm, - groups, dilations_t, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, input_grad, filter_grad, out_grad_grad); diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index 9929ee908df5d..0d4cdddf6b520 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -260,12 +260,9 @@ void ConvGradGradKernel(const Context& dev_ctx, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, DenseTensor* input_grad, DenseTensor* filter_grad, DenseTensor* out_grad_grad) { diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc index 57482eb1bb17e..22ff9b3e1a834 100644 --- a/paddle/phi/ops/compat/conv2d_sig.cc +++ b/paddle/phi/ops/compat/conv2d_sig.cc @@ -47,12 +47,9 @@ KernelSignature Conv2dDoubleGradOpArgumentMapping( {"strides", "paddings", "padding_algorithm", - "groups", "dilations", - "data_format", - "use_addto", - "workspace_size_MB", - "exhaustive_search"}, + "groups", + "data_format"}, {"DInput", "DFilter", "DDOutput"}); } From b3aaf8660a3b9992cca0b2aa79be84546076e874 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 29 Sep 2022 19:45:13 +0200 Subject: [PATCH 11/36] remove redundant imports --- paddle/phi/kernels/onednn/activation_grad_kernel.cc | 3 --- paddle/phi/kernels/onednn/activation_kernel.cc | 3 --- paddle/phi/kernels/onednn/log_softmax_kernel.cc | 3 --- paddle/phi/kernels/onednn/softmax_grad_kernel.cc | 3 --- 4 files changed, 12 deletions(-) diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc index 9e183abf0287f..cec81f187a842 100644 --- a/paddle/phi/kernels/onednn/activation_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc @@ -15,10 +15,7 @@ #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/gelu_grad_kernel.h" -#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc index 36ba1be724ccf..c81b24e402bf8 100644 --- a/paddle/phi/kernels/onednn/activation_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -15,10 +15,7 @@ #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/gelu_grad_kernel.h" -#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/onednn/log_softmax_kernel.cc b/paddle/phi/kernels/onednn/log_softmax_kernel.cc index 963bb7e0e3224..aa811332e524a 100644 --- a/paddle/phi/kernels/onednn/log_softmax_kernel.cc +++ b/paddle/phi/kernels/onednn/log_softmax_kernel.cc @@ -14,10 +14,7 @@ #include "paddle/phi/kernels/log_softmax_kernel.h" -#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/onednn/softmax_grad_kernel.cc b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc index 0209992a679cd..e83135dc11c13 100644 --- a/paddle/phi/kernels/onednn/softmax_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc @@ -14,10 +14,7 @@ #include "paddle/phi/kernels/softmax_grad_kernel.h" -#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { From 3839f84d272011739ef3241c20b008047eff8c64 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 29 Sep 2022 20:30:13 +0200 Subject: [PATCH 12/36] migrate softmax --- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 2 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 112 ------------------ .../operators/mkldnn/test_mkldnn_caching.cc | 2 +- .../mkldnn/test_mkldnn_op_inplace.cc | 3 +- paddle/phi/backends/onednn/onednn_reuse.h | 9 +- paddle/phi/kernels/onednn/softmax_kernel.cc | 61 ++++++++++ 6 files changed, 72 insertions(+), 117 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc create mode 100644 paddle/phi/kernels/onednn/softmax_kernel.cc diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index d78c0c4356266..3f90d32652819 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -23,7 +23,7 @@ #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc deleted file mode 100644 index 659539e5e39b8..0000000000000 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::MKLDNNMemDesc; - -using dnnl::memory; // Note: paddle has also "memory" namespace -using dnnl::primitive; -using dnnl::prop_kind; -using dnnl::softmax_backward; -using dnnl::softmax_forward; -using dnnl::stream; -using platform::to_void_cast; - -template -class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - SoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, - platform::Place cpu_place, - const Tensor* input, - Tensor* output, - const int axis) - : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, - cpu_place) { - PADDLE_ENFORCE_EQ( - input->dims(), - output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - this->AcquireForwardPrimitiveDescriptor( - prop_kind::forward_scoring, input->mem_desc(), axis); - } -}; - -template -class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); - bool is_inplaced = input->IsSharedBufferWith(*output); - - const int axis = - phi::funcs::CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - - SoftmaxMKLDNNHandler handler( - mkldnn_engine, ctx.GetPlace(), input, output, axis); - - auto softmax_src_memory_p = handler.AcquireSrcMemory(input); - // For Inplace src and and dst are the same memory object - std::shared_ptr softmax_dst_memory_p = nullptr; - if (is_inplaced) { - softmax_dst_memory_p = softmax_src_memory_p; - output->mutable_data(ctx.GetPlace()); - } else { - softmax_dst_memory_p = handler.AcquireDstMemory(output); - } - auto softmax_p = handler.AcquireForwardPrimitive(); - - auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - softmax_p->execute(astream, - {{DNNL_ARG_SRC, *softmax_src_memory_p}, - {DNNL_ARG_DST, *softmax_dst_memory_p}}); - astream.wait(); - - const bool is_test = ctx.Attr("is_test"); - if (!is_test) { - T* output_data = output->mutable_data(ctx.GetPlace()); - std::for_each(output_data, &output_data[output->numel()], [](T& val) { - val = std::max(val, static_cast(exp(-64))); - }); - } - - output->set_mem_desc(softmax_dst_memory_p->get_desc()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(softmax, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::SoftmaxMKLDNNKernel, - ops::SoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index f05bd2635116c..dab6fda9e981b 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -34,7 +34,7 @@ USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 196e018507069..b027ec3d303c8 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -32,8 +32,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, MKLDNN); - +PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index d52e0bd5db8e4..31128448a7e96 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -692,12 +692,19 @@ class SoftmaxOneDNNHandler public: SoftmaxOneDNNHandler(const dnnl::engine onednn_engine, Place cpu_place, + int axis, const DenseTensor* x, - int axis) + DenseTensor* out) : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->dims(), + out->dims(), + phi::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + const int canonical_axis = funcs::CanonicalAxis(axis, x->dims().size()); this->AcquireForwardPrimitiveDescriptor( dnnl::prop_kind::forward_scoring, x->mem_desc(), canonical_axis); diff --git a/paddle/phi/kernels/onednn/softmax_kernel.cc b/paddle/phi/kernels/onednn/softmax_kernel.cc new file mode 100644 index 0000000000000..818396e71bd46 --- /dev/null +++ b/paddle/phi/kernels/onednn/softmax_kernel.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + funcs::SoftmaxOneDNNHandler handler( + dev_ctx.GetEngine(), dev_ctx.GetPlace(), axis, &x, out); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + std::shared_ptr dst_memory_p = nullptr; + if (x.IsSharedBufferWith(*out)) { + dst_memory_p = src_memory_p; + dev_ctx.template Alloc(out); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } + auto softmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + softmax_p->execute(astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + // Find a way to get attribute + // const bool is_test = ctx.Attr("is_test"); + // if (!is_test) { + // T* output_data = out->mutable_data(ctx.GetPlace()); + // std::for_each(output_data, &output_data[out->numel()], [](T& val) { + // val = std::max(val, static_cast(exp(-64))); + // }); + // } + + out->set_mem_desc(dst_memory_p->get_desc()); + } + +} // namespace phi + +PD_REGISTER_KERNEL( + softmax, OneDNN, ALL_LAYOUT, phi::SoftmaxKernel, float, phi::dtype::bfloat16) {} From 853b1db473b4c7ea8b6a0d22daa6c9c9f1767cf5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 30 Sep 2022 05:11:46 +0000 Subject: [PATCH 13/36] clear yaml and eager extra attr --- .../manual/eager_manual/dygraph_forward_api.h | 7 +-- .../forwards/conv2d_fwd_function.cc | 31 +++------- .../manual/eager_manual/nodes/conv2d_nodes.cc | 19 +----- .../api/manual/eager_manual/nodes/nodes.h | 20 ------- paddle/phi/api/yaml/legacy_backward.yaml | 8 +-- paddle/phi/api/yaml/legacy_ops.yaml | 6 +- paddle/phi/infermeta/binary.cc | 59 +++++++++++++------ paddle/phi/infermeta/binary.h | 42 ++++++++----- python/paddle/fluid/dygraph/nn.py | 4 +- python/paddle/nn/functional/conv.py | 7 +-- 10 files changed, 92 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index bc970f4e2d859..4b0247f7c5e37 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -25,9 +25,6 @@ paddle::experimental::Tensor conv2d_ad_func( std::vector strides, std::vector paddings, std::string paddding_algorithm, - int groups, std::vector dilations, - std::string data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search); + int groups, + std::string data_format); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index d5f15883e0e19..730d494cdf24e 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -30,12 +30,9 @@ paddle::experimental::Tensor conv2d_ad_func( std::vector strides, std::vector paddings, std::string paddding_algorithm, - int groups, std::vector dilations, - std::string data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search) { + int groups, + std::string data_format) { // Dygraph Record Event paddle::platform::RecordEvent dygraph_entrance_record_event( "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1); @@ -65,12 +62,9 @@ paddle::experimental::Tensor conv2d_ad_func( strides, paddings, paddding_algorithm, - groups, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); } } @@ -93,12 +87,9 @@ paddle::experimental::Tensor conv2d_ad_func( strides, paddings, paddding_algorithm, - groups, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); transformer->SetOutTensorLayout(&out); if (need_tune) { egr::Controller::Instance().EnableLayoutAutoTune(); @@ -120,12 +111,9 @@ paddle::experimental::Tensor conv2d_ad_func( strides, paddings, paddding_algorithm, - groups, dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + groups, + data_format); // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { egr::CheckTensorHasNanOrInf("conv2d", api_result); @@ -161,9 +149,6 @@ paddle::experimental::Tensor conv2d_ad_func( grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); - grad_node->SetAttributeuse_addto(use_addto); - grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); - grad_node->SetAttributeexhaustive_search(exhaustive_search); // Set TensorWrappers for Forward Inputs if needed grad_node->SetTensorWrapperinput(input); grad_node->SetTensorWrapperfilter(filter); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index ce8d647cb9ece..fd3b3bda41d99 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -50,9 +50,6 @@ Conv2dGradNodeFinal::operator()( auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; - auto& use_addto = this->use_addto_; - auto& workspace_size_MB = this->workspace_size_MB_; - auto& exhaustive_search = this->exhaustive_search_; // Prepare Grad function call const auto& out_metas = OutputMeta(); @@ -88,12 +85,9 @@ Conv2dGradNodeFinal::operator()( strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, api_output_0, api_output_1); // Check NaN and Inf id needed @@ -138,9 +132,6 @@ Conv2dGradNodeFinal::operator()( grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); - grad_node->SetAttributeuse_addto(use_addto); - grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); - grad_node->SetAttributeexhaustive_search(exhaustive_search); // Set TensorWrappers for Forward Inputs if needed grad_node->SetTensorWrapperinput(input); grad_node->SetTensorWrapperfilter(filter); @@ -219,9 +210,6 @@ Conv2dDoubleGradNodeFinal::operator()( auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; - auto& use_addto = this->use_addto_; - auto& workspace_size_MB = this->workspace_size_MB_; - auto& exhaustive_search = this->exhaustive_search_; // Prepare Grad function call const auto& out_metas = OutputMeta(); @@ -262,12 +250,9 @@ Conv2dDoubleGradNodeFinal::operator()( strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - use_addto, - workspace_size_MB, - exhaustive_search, api_output_0, api_output_1, api_output_2); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h index 14fe144c0094a..ead68b074e1fc 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -73,13 +73,6 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { void SetAttributedata_format(const std::string& data_format) { data_format_ = data_format; } - void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } - void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { - workspace_size_MB_ = workspace_size_MB; - } - void SetAttributeexhaustive_search(const bool& exhaustive_search) { - exhaustive_search_ = exhaustive_search; - } private: // TensorWrappers @@ -93,9 +86,6 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { int groups_; std::vector dilations_; std::string data_format_; - bool use_addto_; - int workspace_size_MB_; - bool exhaustive_search_; }; class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { @@ -156,13 +146,6 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { void SetAttributedata_format(const std::string& data_format) { data_format_ = data_format; } - void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } - void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { - workspace_size_MB_ = workspace_size_MB; - } - void SetAttributeexhaustive_search(const bool& exhaustive_search) { - exhaustive_search_ = exhaustive_search; - } private: // TensorWrappers @@ -177,9 +160,6 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { int groups_; std::vector dilations_; std::string data_format_; - bool use_addto_; - int workspace_size_MB_; - bool exhaustive_search_; }; class AddNGradNodeFinal : public egr::GradNodeBase { diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 2f4eff9835604..34b19e61884a7 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -424,8 +424,8 @@ func : conj - backward_op : conv2d_grad - forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -436,8 +436,8 @@ backward : conv2d_grad_grad - backward_op : conv2d_grad_grad - forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index bca2aeb58a507..a8655227f890c 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -545,7 +545,7 @@ backward : conj_grad - op : conv2d - args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor infer_meta : func : ConvInferMeta @@ -568,7 +568,7 @@ args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor infer_meta : - func : ConvInferMeta + func : Conv3DInferMeta kernel : func : conv3d use_gpudnn : true @@ -673,7 +673,7 @@ args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(out) infer_meta : - func : ConvInferMeta + func : DepthwiseConvInferMeta param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] kernel : func : depthwise_conv2d diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f4be109f7c79d..c96e7dcbe87fd 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -409,12 +409,9 @@ void ConvInferMeta(const MetaTensor& input, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, - int groups, const std::vector& dilations_t, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config) { std::vector paddings = paddings_t; @@ -559,27 +556,27 @@ void ConvInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } -void ConvInferInferMeta(const MetaTensor& input, - const MetaTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - MetaTensor* out, - MetaConfig config) { +void Conv3DInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { ConvInferMeta(input, filter, strides, paddings, paddding_algorithm, - groups, dilations, + groups, data_format, - /*use_addto=*/false, - /*workspace_size_MB=*/512, // useless in infermeta - /*exhaustive_search=*/false, out, config); } @@ -923,6 +920,31 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, loss->share_lod(logits); } +void DepthwiseConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + ConvInferMeta(input, + filter, + strides, + paddings, + paddding_algorithm, + dilations, + groups, + data_format, + out, + config); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2877,4 +2899,3 @@ void Unpool3dInferMeta(const MetaTensor& x, } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); -PD_REGISTER_INFER_META_FN(conv2d_infer, phi::ConvInferInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d310f6282b09d..d4c30bf5b84aa 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -81,25 +81,25 @@ void ConvInferMeta(const MetaTensor& input, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, - int groups, const std::vector& dilations, + int groups, const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConvInferInferMeta(const MetaTensor& input, - const MetaTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void Conv3DInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); void ConvTransposeInferMeta(const MetaTensor& x, const MetaTensor& filter, @@ -143,6 +143,20 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, MetaTensor* loss, MetaConfig config = MetaConfig()); +void DepthwiseConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index e21642b1c468c..4d4c24f74c435 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -239,9 +239,9 @@ def _get_default_param_initializer(): def forward(self, input): if in_dygraph_mode() and self._l_type == "conv2d": pre_bias = _C_ops.conv2d(input, self.weight, self._stride, - self._padding, "EXPLICIT", + self._padding, "EXPLICIT", self._dilation, self._groups if self._groups else 1, - self._dilation, "NCHW", False, -1, False) + "NCHW") if self.bias is not None: pre_act = F.elementwise_add(pre_bias, self.bias, axis=1) else: diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index e70fa037ae256..70d3b06c738a4 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -124,8 +124,7 @@ def _conv_nd(x, # Due to the poor performance of NHWC, we transpose the input to NCHW. if in_dygraph_mode() and op_type == "conv2d": pre_bias = _C_ops.conv2d(x, weight, stride, padding, padding_algorithm, - groups, dilation, data_format, False, -1, - False) + dilation, groups, data_format) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim @@ -620,8 +619,8 @@ def conv2d(x, else: if in_dygraph_mode(): pre_bias = _C_ops.conv2d(x, weight, stride, padding, - padding_algorithm, groups, dilation, - data_format, False, -1, False) + padding_algorithm, dilation, groups, + data_format) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) return out From de52450a47ab4c914518d36422c27b3634591cab Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 10 Oct 2022 11:55:35 +0000 Subject: [PATCH 14/36] fix conv1d error --- python/paddle/nn/functional/conv.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 70d3b06c738a4..9aaed427ebbe7 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -408,10 +408,14 @@ def conv1d(x, x = unsqueeze(x, axis=[squeeze_aixs]) if in_dygraph_mode(): - out = getattr(_C_ops, - l_type)(x, weight, stride, padding, padding_algorithm, - groups, dilation, conv2d_data_format, False, -1, - False, False, use_cudnn) + if l_type == 'conv2d': + out = _C_ops.conv2d(x, weight, stride, padding, padding_algorithm, + dilation, groups, conv2d_data_format) + else: + out = getattr(_C_ops, + l_type)(x, weight, stride, padding, padding_algorithm, + groups, dilation, conv2d_data_format, False, + -1, False, False, use_cudnn) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) elif _in_legacy_dygraph(): From 03f6893873ae34009ac230b55c861786a72076f9 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 11 Oct 2022 05:02:02 +0000 Subject: [PATCH 15/36] change to thread local --- paddle/fluid/platform/device_context.h | 2 +- paddle/phi/backends/gpu/gpu_context.cc | 38 ++++---------------- paddle/phi/backends/gpu/gpu_context.h | 2 -- paddle/phi/backends/onednn/onednn_context.cc | 33 ++++------------- paddle/phi/backends/onednn/onednn_context.h | 2 -- 5 files changed, 14 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e2ff4911dc049..a16255e558f09 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -193,7 +193,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_ASCEND_CL class NPUDeviceContext - : public DeviceContex, + : public DeviceContext, public TypeInfoTraits { public: explicit NPUDeviceContext(NPUPlace place); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index def4e71f78f2b..69f0baf069984 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -720,6 +720,7 @@ struct GPUContext::Impl { bool HasDnnAttr(const std::string& attr_name) const { return dnn_attrs_.count(attr_name) != 0UL; } + const Attribute& GetDnnAttr(const std::string& attr_name) const { auto iter = dnn_attrs_.find(attr_name); PADDLE_ENFORCE_NE( @@ -730,24 +731,9 @@ struct GPUContext::Impl { } void SetDnnAttr(const std::string& attr_name, Attribute attr) { - if (dnn_attrs_version_ == 0) { - dnn_attrs_[attr_name] = attr; - } else { - auto iter = dnn_attrs_.find(attr_name); - // TODO(chenweihang): this check is not enough - PADDLE_ENFORCE_EQ( - iter, - dnn_attrs_.end(), - phi::errors::AlreadyExists( - "Attribute `%s` in OneDNNContext is being used now.")); - dnn_attrs_[attr_name] = attr; - } + dnn_attrs_[attr_name] = attr; } - int8_t DnnAttrsVersion() const { return dnn_attrs_version_; } - - void SetDnnAttrsVersion(int8_t version) { dnn_attrs_version_ = version; } - // use one flag for all handles? // they should be accessed consistently bool owned_{false}; @@ -812,18 +798,14 @@ struct GPUContext::Impl { // A internal resouce to initinalize eigen_device. std::unique_ptr eigen_stream_{nullptr}; - // For thread safety when using onednn_attrs_ - int8_t dnn_attrs_version_ = 0; // Holds some attributes only used by the gpudnn kernel calculation - // Since original mkldnn op kernel directly adds the operations that require - // fusion to the native kernel operations, and uses the attribute `fuse_xxx` - // to control, for gpudnn, there will be some attributes that seem to be - // independent of the device are also saved here. - // Here, the operation of fusion needs to be implemented separately as - // a fusion op and kernel, instead of patching it to a basic operation. - AttributeMap dnn_attrs_; + // Because DeviceContext is a global singleton, you need to ensure thread + // safety, use the thread_local variable + static thread_local AttributeMap dnn_attrs_; }; +thread_local AttributeMap GPUContext::Impl::dnn_attrs_ = {}; + GPUContext::GPUContext(GPUContext&&) = default; GPUContext& GPUContext::operator=(GPUContext&&) = default; @@ -1054,10 +1036,4 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { return impl_->SetDnnAttr(attr_name, std::move(attr)); } -int8_t GPUContext::DnnAttrsVersion() const { return impl_->DnnAttrsVersion(); } - -void GPUContext::SetDnnAttrsVersion(int8_t version) { - return impl_->SetDnnAttrsVersion(version); -} - } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 9f5f8d6264e96..84aba73fad1e1 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -172,8 +172,6 @@ class PADDLE_API GPUContext : public DeviceContext, bool HasDnnAttr(const std::string& attr_name) const; const Attribute& GetDnnAttr(const std::string& attr_name) const; void SetDnnAttr(const std::string& attr_name, Attribute attr); - int8_t DnnAttrsVersion() const; - void SetDnnAttrsVersion(int8_t version); static const char* name() { return "GPUContext"; } diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index 975c4726ded44..f57b10fb51a3b 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -298,24 +298,9 @@ struct OneDNNContext::Impl { } void SetDnnAttr(const std::string& attr_name, Attribute attr) { - if (dnn_attrs_version_ == 0) { - dnn_attrs_[attr_name] = attr; - } else { - auto iter = dnn_attrs_.find(attr_name); - // TODO(chenweihang): this check is not enough - PADDLE_ENFORCE_EQ( - iter, - dnn_attrs_.end(), - phi::errors::AlreadyExists( - "Attribute `%s` in OneDNNContext is being used now.")); - dnn_attrs_[attr_name] = attr; - } + dnn_attrs_[attr_name] = attr; } - int8_t DnnAttrsVersion() const { return dnn_attrs_version_; } - - void SetDnnAttrsVersion(int8_t version) { dnn_attrs_version_ = version; } - std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase @@ -324,8 +309,6 @@ struct OneDNNContext::Impl { // 0 - clearing is allowed. x > 0 do not clear. unsigned int block_next_cache_clearing_ = 0; - // For thread safety when using dnn_attrs_ - int8_t dnn_attrs_version_ = 0; // Holds some attributes only used by the onednn kernel calculation // Since original mkldnn op kernel directly adds the operations that require // fusion to the native kernel operations, and uses the attribute `fuse_xxx` @@ -333,9 +316,13 @@ struct OneDNNContext::Impl { // independent of the device are also saved here. // Here, the operation of fusion needs to be implemented separately as // a fusion op and kernel, instead of patching it to a basic operation. - AttributeMap dnn_attrs_; + // Because DeviceContext is a global singleton, you need to ensure thread + // safety, use the thread_local variable + static thread_local AttributeMap dnn_attrs_; }; +thread_local AttributeMap OneDNNContext::Impl::dnn_attrs_ = {}; + OneDNNContext::OneDNNContext(const Place& place) : CPUContext(place), impl_(std::make_unique()) {} @@ -377,13 +364,5 @@ void OneDNNContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { return impl_->SetDnnAttr(attr_name, std::move(attr)); } -int8_t OneDNNContext::DnnAttrsVersion() const { - return impl_->DnnAttrsVersion(); -} - -void OneDNNContext::SetDnnAttrsVersion(int8_t version) { - return impl_->SetDnnAttrsVersion(version); -} - } // namespace phi #endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index e077466ea725f..ee8474a41ccda 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -139,8 +139,6 @@ class OneDNNContext : public CPUContext { bool HasDnnAttr(const std::string& attr_name) const; const Attribute& GetDnnAttr(const std::string& attr_name) const; void SetDnnAttr(const std::string& attr_name, Attribute attr); - int8_t DnnAttrsVersion() const; - void SetDnnAttrsVersion(int8_t version); static const char* name() { return "OneDNNContext"; } From 49bcebe270cdd41cdcee1712c16146e24dfed205 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 11 Oct 2022 06:39:23 +0000 Subject: [PATCH 16/36] fix npu compile failed --- paddle/fluid/platform/device_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a16255e558f09..0ad10cd8a7b3a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -194,7 +194,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_ASCEND_CL class NPUDeviceContext : public DeviceContext, - public TypeInfoTraits { + public phi::TypeInfoTraits { public: explicit NPUDeviceContext(NPUPlace place); virtual ~NPUDeviceContext(); From ada7d0cbf7a384fb4e6cc6959e8947265936cc12 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 11 Oct 2022 07:56:00 +0000 Subject: [PATCH 17/36] try to fix windows compile failed --- paddle/phi/kernels/cpu/erfinv_grad_kernel.cc | 4 +++ paddle/phi/kernels/cpu/erfinv_kernel.cc | 22 ++++++++++-- paddle/phi/kernels/gpu/erfinv_grad_kernel.cu | 4 +++ .../kernels/impl/erfinv_grad_kernel_impl.h | 3 -- paddle/phi/kernels/impl/erfinv_kernel_impl.h | 35 ------------------- 5 files changed, 28 insertions(+), 40 deletions(-) delete mode 100644 paddle/phi/kernels/impl/erfinv_kernel_impl.h diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc index 2d363189936b0..8ebb70653e072 100644 --- a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc index f298cc358d662..8822d6e536cb6 100644 --- a/paddle/phi/kernels/cpu/erfinv_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -12,10 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto eigen_in = EigenVector::Flatten(x); + auto eigen_out = EigenVector::Flatten(*out); + auto& place = *ctx.eigen_device(); + constexpr T half = static_cast(0.5); + constexpr T half_sqrt = static_cast(M_SQRT1_2); + eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; +} + +} // namespace phi PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu index ce45d4e018e5a..034788502b043 100644 --- a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu @@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + #include "paddle/phi/kernels/erfinv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h index ae76574e04e71..2d41d04f43c49 100644 --- a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h @@ -13,9 +13,6 @@ // limitations under the License. #pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/impl/erfinv_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_kernel_impl.h deleted file mode 100644 index c0fb8a01b9971..0000000000000 --- a/paddle/phi/kernels/impl/erfinv_kernel_impl.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -template -void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc(out); - auto eigen_in = EigenVector::Flatten(x); - auto eigen_out = EigenVector::Flatten(*out); - auto& place = *ctx.eigen_device(); - constexpr T half = static_cast(0.5); - constexpr T half_sqrt = static_cast(M_SQRT1_2); - eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; -} - -} // namespace phi From 22ccfa94fbb3abc19b7605757775aa0281c2c20d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Oct 2022 07:40:26 +0000 Subject: [PATCH 18/36] add conv2d onednn phi kernel --- .../manual/eager_manual/dygraph_forward_api.h | 2 +- .../forwards/conv2d_fwd_function.cc | 10 +- .../manual/eager_manual/nodes/conv2d_nodes.cc | 10 +- .../api/manual/eager_manual/nodes/nodes.h | 12 +- paddle/fluid/framework/archive.h | 2 +- paddle/fluid/framework/channel.h | 2 +- paddle/fluid/framework/operator.cc | 86 +- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/prelu_mkldnn_op.cc | 2 +- paddle/fluid/operators/ops_extra_info.h | 21 + paddle/fluid/platform/device_context.cc | 2 +- paddle/phi/api/yaml/legacy_backward.yaml | 26 +- paddle/phi/api/yaml/legacy_ops.yaml | 2 +- paddle/phi/backends/onednn/onednn_context.cc | 41 +- paddle/phi/backends/onednn/onednn_context.h | 6 +- paddle/phi/backends/onednn/onednn_helper.h | 35 + paddle/phi/backends/onednn/onednn_reuse.h | 62 ++ paddle/{fluid/framework => phi/core}/expect.h | 0 paddle/phi/infermeta/binary.cc | 8 +- paddle/phi/infermeta/binary.h | 6 +- paddle/phi/kernels/conv_grad_kernel.h | 10 +- paddle/phi/kernels/conv_kernel.h | 2 +- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 8 +- paddle/phi/kernels/gpu/conv_grad_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 8 +- paddle/phi/kernels/onednn/conv_grad_kernel.cc | 190 +++++ paddle/phi/kernels/onednn/conv_handler.h | 764 ++++++++++++++++++ paddle/phi/kernels/onednn/conv_kernel.cc | 457 +++++++++++ paddle/phi/kernels/xpu/conv_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/conv_kernel.cc | 4 +- 32 files changed, 1669 insertions(+), 123 deletions(-) rename paddle/{fluid/framework => phi/core}/expect.h (100%) create mode 100644 paddle/phi/kernels/onednn/conv_grad_kernel.cc create mode 100644 paddle/phi/kernels/onednn/conv_handler.h create mode 100644 paddle/phi/kernels/onednn/conv_kernel.cc diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index 4b0247f7c5e37..22a4b03312ce0 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -24,7 +24,7 @@ paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& filter, std::vector strides, std::vector paddings, - std::string paddding_algorithm, + std::string padding_algorithm, std::vector dilations, int groups, std::string data_format); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 730d494cdf24e..842ff4822dba4 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -29,7 +29,7 @@ paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& filter, std::vector strides, std::vector paddings, - std::string paddding_algorithm, + std::string padding_algorithm, std::vector dilations, int groups, std::string data_format) { @@ -61,7 +61,7 @@ paddle::experimental::Tensor conv2d_ad_func( new_filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format); @@ -86,7 +86,7 @@ paddle::experimental::Tensor conv2d_ad_func( filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format); @@ -110,7 +110,7 @@ paddle::experimental::Tensor conv2d_ad_func( filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format); @@ -145,7 +145,7 @@ paddle::experimental::Tensor conv2d_ad_func( // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); - grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributepadding_algorithm(padding_algorithm); grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index fd3b3bda41d99..647f6768bc6b1 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -46,7 +46,7 @@ Conv2dGradNodeFinal::operator()( auto& grad_out = hooked_grads[0][0]; auto& strides = this->strides_; auto& paddings = this->paddings_; - auto& paddding_algorithm = this->paddding_algorithm_; + auto& padding_algorithm = this->padding_algorithm_; auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; @@ -84,7 +84,7 @@ Conv2dGradNodeFinal::operator()( grad_out, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, @@ -128,7 +128,7 @@ Conv2dGradNodeFinal::operator()( // SetAttributes if needed grad_node->SetAttributestrides(strides); grad_node->SetAttributepaddings(paddings); - grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributepadding_algorithm(padding_algorithm); grad_node->SetAttributegroups(groups); grad_node->SetAttributedilations(dilations); grad_node->SetAttributedata_format(data_format); @@ -206,7 +206,7 @@ Conv2dDoubleGradNodeFinal::operator()( auto& strides = this->strides_; auto& paddings = this->paddings_; - auto& paddding_algorithm = this->paddding_algorithm_; + auto& padding_algorithm = this->padding_algorithm_; auto& groups = this->groups_; auto& dilations = this->dilations_; auto& data_format = this->data_format_; @@ -249,7 +249,7 @@ Conv2dDoubleGradNodeFinal::operator()( grad_filter_grad_optional, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h index ead68b074e1fc..6fb583703848a 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -63,8 +63,8 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { void SetAttributepaddings(const std::vector& paddings) { paddings_ = paddings; } - void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { - paddding_algorithm_ = paddding_algorithm; + void SetAttributepadding_algorithm(const std::string& padding_algorithm) { + padding_algorithm_ = padding_algorithm; } void SetAttributegroups(const int& groups) { groups_ = groups; } void SetAttributedilations(const std::vector& dilations) { @@ -82,7 +82,7 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase { // Attributes std::vector strides_; std::vector paddings_; - std::string paddding_algorithm_; + std::string padding_algorithm_; int groups_; std::vector dilations_; std::string data_format_; @@ -136,8 +136,8 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { void SetAttributepaddings(const std::vector& paddings) { paddings_ = paddings; } - void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { - paddding_algorithm_ = paddding_algorithm; + void SetAttributepadding_algorithm(const std::string& padding_algorithm) { + padding_algorithm_ = padding_algorithm; } void SetAttributegroups(const int& groups) { groups_ = groups; } void SetAttributedilations(const std::vector& dilations) { @@ -156,7 +156,7 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { // Attributes std::vector strides_; std::vector paddings_; - std::string paddding_algorithm_; + std::string padding_algorithm_; int groups_; std::vector dilations_; std::string data_format_; diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h index a68470492e936..5e7e510b06106 100644 --- a/paddle/fluid/framework/archive.h +++ b/paddle/fluid/framework/archive.h @@ -32,8 +32,8 @@ #include #include -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index fe6c51b87228a..f498b1db2de70 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -30,7 +30,7 @@ #include #include -#include "paddle/fluid/framework/expect.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9b491a423d8c0..4fa427451500b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -3020,9 +3020,6 @@ void OperatorWithKernel::BuildPhiKernelContext( phi::OneDNNContext* one_dnn_ctx = static_cast(dev_ctx); switch (AttrTypeID(attr)) { - case proto::AttrType::INT: - one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); - break; case proto::AttrType::FLOAT: one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); break; @@ -3038,28 +3035,9 @@ void OperatorWithKernel::BuildPhiKernelContext( one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::vector, attr)); break; - case proto::AttrType::STRINGS: - one_dnn_ctx->SetDnnAttr( - attr_name, PADDLE_GET_CONST(std::vector, attr)); - break; case proto::AttrType::BOOLEAN: one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); break; - case proto::AttrType::BOOLEANS: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::LONG: - one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int64_t, attr)); - break; - case proto::AttrType::LONGS: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::FLOAT64S: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported Attribute value type `%s` for phi.", @@ -3077,43 +3055,9 @@ void OperatorWithKernel::BuildPhiKernelContext( case proto::AttrType::INT: gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); break; - case proto::AttrType::FLOAT: - gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); - break; - case proto::AttrType::STRING: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::string, attr)); - break; - case proto::AttrType::INTS: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::FLOATS: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::STRINGS: - gpu_dnn_ctx->SetDnnAttr( - attr_name, PADDLE_GET_CONST(std::vector, attr)); - break; case proto::AttrType::BOOLEAN: gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); break; - case proto::AttrType::BOOLEANS: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::LONG: - gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int64_t, attr)); - break; - case proto::AttrType::LONGS: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::FLOAT64S: - gpu_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported Attribute value type `%s` for phi.", @@ -3124,6 +3068,36 @@ void OperatorWithKernel::BuildPhiKernelContext( } VLOG(4) << "Done runtime attributes"; #endif + +// For compatible with Op with extra input for onednn backend +#ifdef PADDLE_WITH_MKLDNN + if (phi::OneDNNContext::classof(dev_ctx)) { + phi::OneDNNContext* one_dnn_ctx = static_cast(dev_ctx); + auto& extra_input_names = + paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( + Type()); + for (const auto& input_name : extra_input_names) { + auto it = ctx.inputs.find(input_name); + if (it == ctx.inputs.end() || it->second.size() == 0) { + one_dnn_ctx->SetDnnInput(input_name, nullptr); + } else { + auto ins_vector = it->second; + PADDLE_ENFORCE_EQ( + ins_vector.size(), + 1UL, + phi::errors::InvalidArgument( + "OneDNN's extra input only allows one input tensor.")); + auto* var = ins_vector[0]; + PADDLE_ENFORCE_EQ(var->IsType(), + true, + phi::errors::InvalidArgument( + "OneDNN's extra input only can be DenseTensor.")); + one_dnn_ctx->SetDnnInput(input_name, &(var->Get())); + } + } + } + VLOG(4) << "Done runtime extra inputs"; +#endif } } // namespace framework diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index a040aa3779323..71dd1c203fa1a 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h" #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index 6ecde2fdcf87e..1d40d4ac20b80 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index cdd064467ebc5..e4927ed38957f 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -14,11 +14,11 @@ #include -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index e3b9d3ffd7c6a..246bfacca7f4e 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/core/expect.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index af6304375cedd..d05d4a57ac492 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/framework/attribute.h" @@ -87,6 +88,8 @@ const std::unordered_map {"use_cudnn", ExtraAttrProperty::SCHEDULE}, {"use_mkldnn", ExtraAttrProperty::SCHEDULE}, // ONEDNN dedicated attributes + {"Activation_scale", ExtraAttrProperty::ONEDNN}, + {"Bias", ExtraAttrProperty::ONEDNN}, {"data_format", ExtraAttrProperty::ONEDNN}, {"force_fp32_output", ExtraAttrProperty::ONEDNN}, {"fuse_activation", ExtraAttrProperty::ONEDNN}, @@ -108,14 +111,18 @@ const std::unordered_map {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, + {"Output_shift_scale", ExtraAttrProperty::ONEDNN}, + {"ResidualData", ExtraAttrProperty::ONEDNN}, {"scale_x", ExtraAttrProperty::ONEDNN}, {"scale_y", ExtraAttrProperty::ONEDNN}, {"scale_out", ExtraAttrProperty::ONEDNN}, + {"Scale_in", ExtraAttrProperty::ONEDNN}, {"Scale_in_eltwise", ExtraAttrProperty::ONEDNN}, {"Scale_x", ExtraAttrProperty::ONEDNN}, {"Scale_y", ExtraAttrProperty::ONEDNN}, {"Scale_out", ExtraAttrProperty::ONEDNN}, {"Scale_weights", ExtraAttrProperty::ONEDNN}, + {"Sum_scale", ExtraAttrProperty::ONEDNN}, {"x_data_format", ExtraAttrProperty::ONEDNN}, {"y_data_format", ExtraAttrProperty::ONEDNN}, // GPUDNN dedicated attributes @@ -190,6 +197,15 @@ class ExtraInfoUtils { return empty_extra_attrs_checker_; } + const std::vector& GetExtraInputNamesMap( + const std::string& op_type) const { + auto iter = g_extra_input_names_map_.find(op_type); + if (iter != g_extra_input_names_map_.end()) { + return iter->second; + } + return empty_extra_input_names_; + } + private: ExtraInfoUtils(); @@ -202,6 +218,11 @@ class ExtraInfoUtils { g_extra_attrs_checker_; std::vector> empty_extra_attrs_checker_{}; + + std::unordered_map> + g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, + {"conv2d_grad", {"Bias"}}}; + std::vector empty_extra_input_names_; }; } // namespace operators diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c39705b618f65..f0b1efc769430 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" @@ -28,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/expect.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 519ba7c03fb0e..8f474b68af15f 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -424,8 +424,8 @@ func : conj - backward_op : conv2d_grad - forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) + forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -436,8 +436,8 @@ backward : conv2d_grad_grad - backward_op : conv2d_grad_grad - forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int[] dilations, int groups, str data_format) + forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -469,8 +469,8 @@ backward : conv2d_transpose_double_grad - backward_op : conv3d_grad - forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -481,8 +481,8 @@ backward : conv3d_grad_grad - backward_op : conv3d_grad_grad - forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -573,21 +573,21 @@ optional : mask - backward_op : depthwise_conv2d_grad - forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta param : [input, filter] kernel : func : depthwise_conv2d_grad - param : [input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad_grad - backward_op : depthwise_conv2d_grad_grad - forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 2a1fb70895c4c..e504cf51347f6 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -565,7 +565,7 @@ backward : conv2d_transpose_grad - op : conv3d - args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor infer_meta : func : Conv3DInferMeta diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index f57b10fb51a3b..5aa8090c62fe5 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -18,8 +18,8 @@ #include "paddle/phi/core/enforce.h" #include "paddle/utils/flat_hash_map.h" -#include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/expect.h" namespace phi { @@ -301,6 +301,24 @@ struct OneDNNContext::Impl { dnn_attrs_[attr_name] = attr; } + bool HasDnnInput(const std::string& input_name) const { + return dnn_inputs_.count(input_name) != 0UL; + } + + const DenseTensor* GetDnnInput(const std::string& input_name) const { + auto iter = dnn_inputs_.find(input_name); + PADDLE_ENFORCE_NE( + iter, + dnn_inputs_.end(), + phi::errors::NotFound( + "Input DenseTensor `%s` is not found in OneDNNContext.")); + return iter->second; + } + + void SetDnnInput(const std::string& input_name, const DenseTensor* input) { + dnn_inputs_[input_name] = input; + } + std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase @@ -319,9 +337,16 @@ struct OneDNNContext::Impl { // Because DeviceContext is a global singleton, you need to ensure thread // safety, use the thread_local variable static thread_local AttributeMap dnn_attrs_; + // For onednn, in addition to extra attrs, there are also extra inputs, + // but the number is small. Hope that the implementation can be optimized + // to remove this member in the future. + static thread_local paddle::flat_hash_map + dnn_inputs_; }; thread_local AttributeMap OneDNNContext::Impl::dnn_attrs_ = {}; +thread_local paddle::flat_hash_map + OneDNNContext::Impl::dnn_inputs_ = {}; OneDNNContext::OneDNNContext(const Place& place) : CPUContext(place), impl_(std::make_unique()) {} @@ -364,5 +389,19 @@ void OneDNNContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { return impl_->SetDnnAttr(attr_name, std::move(attr)); } +bool OneDNNContext::HasDnnInput(const std::string& input_name) const { + return impl_->HasDnnInput(input_name); +} + +const DenseTensor* OneDNNContext::GetDnnInput( + const std::string& input_name) const { + return impl_->GetDnnInput(input_name); +} + +void OneDNNContext::SetDnnInput(const std::string& input_name, + const DenseTensor* input) { + return impl_->SetDnnInput(input_name, input); +} + } // namespace phi #endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index ee8474a41ccda..d7cf1752bcce4 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -135,11 +135,15 @@ class OneDNNContext : public CPUContext { return OneDNNContextThreadLocals::fetch(); } - // Several methods for adapting ONEDNN-specific attributes + // Several methods for adapting ONEDNN-specific attributes and inputs bool HasDnnAttr(const std::string& attr_name) const; const Attribute& GetDnnAttr(const std::string& attr_name) const; void SetDnnAttr(const std::string& attr_name, Attribute attr); + bool HasDnnInput(const std::string& input_name) const; + const DenseTensor* GetDnnInput(const std::string& input_name) const; + void SetDnnInput(const std::string& input_name, const DenseTensor* input); + static const char* name() { return "OneDNNContext"; } private: diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index e91e02282ccc0..11cc9c29f501f 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -195,6 +195,41 @@ inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) { return key; } +inline std::vector> ToOnednnPadding( + const std::vector& paddings) { + if (paddings.size() == 6) { + int padding_front = paddings[0]; + int padding_back = paddings[1]; + int padding_top = paddings[2]; + int padding_bottom = paddings[3]; + int padding_left = paddings[4]; + int padding_right = paddings[5]; + + return {{padding_front, padding_top, padding_left}, + {padding_back, padding_bottom, padding_right}}; + } else { + int padding_top = paddings[0]; + int padding_bottom = paddings[1]; + int padding_left = paddings[2]; + int padding_right = paddings[3]; + + return {{padding_top, padding_left}, {padding_bottom, padding_right}}; + } +} + +// The function adjusts the vector of weight dimensions for group convolutions +inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT + const int groups) { + if (groups > 1) { + // if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w] + // else [o, i, h, w] -> [g, o/g, i, h, w] + weights_tz.push_back(0); + std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end()); + weights_tz[0] = groups; + weights_tz[1] = weights_tz[1] / groups; + } +} + inline void MatchShapeToLayout(DenseTensor* tensor_in, DataLayout from, DataLayout to) { diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 4775a694e1f1a..9f79b9121904c 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -39,6 +39,67 @@ using memory = dnnl::memory; using OneDNNMemoryFormat = dnnl::memory::format_tag; +static void AppendActivation(const OneDNNContext& dev_ctx, + dnnl::post_ops& post_ops, // NOLINT + float activation_scale = 1.0f) { + const auto invalid_attribute = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + .empty() + : true; + if (invalid_attribute) return; + + const auto fuse_activation = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + : ""; + const auto fuse_alpha = + dev_ctx.HasDnnAttr("fuse_alpha") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha")) + : 0.0f; + const auto fuse_beta = + dev_ctx.HasDnnAttr("fuse_beta") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta")) + : 0.0f; + + if (fuse_activation == "hard_sigmoid") { + post_ops.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, + fuse_alpha, + fuse_beta); + post_ops.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + } else { + const std::unordered_map activation_map = { + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"hard_swish", dnnl::algorithm::eltwise_hardswish}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"mish", dnnl::algorithm::eltwise_mish}, + {"relu", dnnl::algorithm::eltwise_relu}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"tanh", dnnl::algorithm::eltwise_tanh}}; + + const auto& activation_type = activation_map.find(fuse_activation); + + PADDLE_ENFORCE_NE( + activation_type, + activation_map.end(), + phi::errors::InvalidArgument( + "Activation '%s' not found in oneDNN algorithms mapper", + fuse_activation)); + + post_ops.append_eltwise( + activation_scale, activation_type->second, fuse_alpha, fuse_beta); + } +} + template logical_axis_; dnnl::engine engine_; }; + } // namespace funcs } // namespace phi diff --git a/paddle/fluid/framework/expect.h b/paddle/phi/core/expect.h similarity index 100% rename from paddle/fluid/framework/expect.h rename to paddle/phi/core/expect.h diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index c96e7dcbe87fd..46c8520ff7237 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -560,7 +560,7 @@ void Conv3DInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -573,7 +573,7 @@ void Conv3DInferMeta(const MetaTensor& input, filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, @@ -924,7 +924,7 @@ void DepthwiseConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -937,7 +937,7 @@ void DepthwiseConvInferMeta(const MetaTensor& input, filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d4c30bf5b84aa..30e22cb3f56a6 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -80,7 +80,7 @@ void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, const std::vector& dilations, int groups, const std::string& data_format, @@ -91,7 +91,7 @@ void Conv3DInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -147,7 +147,7 @@ void DepthwiseConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, diff --git a/paddle/phi/kernels/conv_grad_kernel.h b/paddle/phi/kernels/conv_grad_kernel.h index 21dbc7f8761d1..4164db5f8a01b 100644 --- a/paddle/phi/kernels/conv_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_kernel.h @@ -25,7 +25,7 @@ void ConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, const std::vector& dilations, int groups, const std::string& data_format, @@ -39,7 +39,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -56,7 +56,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -76,7 +76,7 @@ void ConvGradGradKernel(const Context& dev_ctx, const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, const std::vector& dilations, int groups, const std::string& data_format, @@ -93,7 +93,7 @@ void Conv3DGradGradKernel(const Context& dev_ctx, const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h index 0c02e8736aa93..06faee6d3aa1e 100644 --- a/paddle/phi/kernels/conv_kernel.h +++ b/paddle/phi/kernels/conv_kernel.h @@ -51,7 +51,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 7102af6cf4a20..06a63267c5c96 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -27,7 +27,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -43,7 +43,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, @@ -58,7 +58,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -73,7 +73,7 @@ void Conv3DGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/phi/kernels/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_grad_kernel.cu index dea55d34413bd..4f15030365a6c 100644 --- a/paddle/phi/kernels/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_grad_kernel.cu @@ -27,7 +27,7 @@ void Conv3DGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -42,7 +42,7 @@ void Conv3DGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index e67815a5b7dd6..a252bb01b267b 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -611,7 +611,7 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -626,7 +626,7 @@ void Conv3DCudnnGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, @@ -641,7 +641,7 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -657,7 +657,7 @@ void DepthwiseConvCudnnGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/phi/kernels/onednn/conv_grad_kernel.cc b/paddle/phi/kernels/onednn/conv_grad_kernel.cc new file mode 100644 index 0000000000000..190fe26cd346b --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_grad_kernel.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" +#include "paddle/phi/kernels/onednn/conv_handler.h" + +namespace phi { + +#define PD_VISIT_FLOAT_AND_BF16_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::BFLOAT16, \ + ::phi::dtype::bfloat16, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +template +void ConvGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* input_grad, + DenseTensor* filter_grad) { + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet( + "Operator DNNL ConvGrad must use CPUPlace")); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + VLOG(1) << "ConvGrad Has is_test " << dev_ctx.HasDnnAttr("is_test"); + + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr; + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + + if (!input_grad && !filter_grad) return; + + PD_VISIT_FLOAT_AND_BF16_TYPES( + filter.dtype(), "ConvOneDNNHandlerT", ([&] { + // TODO(jczaja): Are all tensors really needed? + onednn::ConvOneDNNHandlerT handler(dev_ctx, + dev_ctx.GetPlace(), + &input, + &filter, + bias, + &out_grad, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + filter_grad, + input_grad); + + // create mkldnn memory from input tensors (data/weights) + auto& astream = OneDNNContext::tls().get_stream(); + + if (filter_grad) { + auto src_memory_p = + handler.AcquireSrcMemoryWithReorderFromWeightsPrimitive(&input); + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + &out_grad); + + // For convoluition with groups write filter grad into + // oneDNN buffer and then we reorder it into filter_grad tensor + int g = std::max(groups, 1); + auto diff_weights_memory_p = + g > 1 ? handler.AcquireDiffWeightsMemory() + : handler.AcquireDiffWeightsMemory(filter_grad); + + auto conv_bwd_weights_p = handler.AcquireBackwardWeightsPrimitive(); + + conv_bwd_weights_p->execute( + astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); + astream.wait(); + + // For convolution with groups convert from blocked to NCHW + // otherwise there will be problems in next operators working on + // this data + if (g > 1) { + // in OneDNN groups in convolution are treated as separate + // dimension which is not the case in paddlepaddle + + dnnl::memory::data_type in_type = + funcs::ToOneDNNDataType(filter.dtype()); + // for 3d conv with groups (six dimensional data reorder to + // goidhw) for 2d conv with groups (five dimensional data reorder + // to goihw) auto weights_tz = phi::vectorize(filter->dims()); + + auto weights_tz = diff_weights_memory_p->get_desc().dims(); + dnnl::memory::format_tag out_format = + weights_tz.size() == 6 ? dnnl::memory::format_tag::goidhw + : dnnl::memory::format_tag::goihw; + funcs::ReorderOneDNNHandler handler( + weights_tz, filter.dtype(), in_type, mkldnn_engine); + auto reorder_dst_memory_p = handler.AcquireDstMemory( + filter_grad, out_format, dev_ctx.GetPlace()); + + auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, + diff_weights_memory_p); + + { + reorder_p->execute( + astream, *diff_weights_memory_p, *reorder_dst_memory_p); + astream.wait(); + } + + // So here we have a data in goihw , which can be interpreted as + // OIHW (OIDHW for conv3d) because filter_grad shape is set for + // OIHW (OIDHW for conv3d) + dnnl::memory::format_tag target_format = + weights_tz.size() == 6 ? dnnl::memory::format_tag::oidhw + : dnnl::memory::format_tag::oihw; + filter_grad->set_mem_desc( + dnnl::memory::desc(phi::vectorize(filter_grad->dims()), + in_type, + target_format)); + } else { + filter_grad->set_mem_desc(diff_weights_memory_p->get_desc()); + } + } + if (input_grad) { + auto weights_memory_p = + handler.AcquireWeightsMemoryWithReorderFromDataPrimitive( + &filter, groups, strides.size() == 3U); + + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + &out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(input_grad); + + auto conv_bwd_data_p = handler.AcquireBackwardPrimitive(); + + conv_bwd_data_p->execute(astream, + {{DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + input_grad->set_mem_desc(diff_src_memory_p->get_desc()); + } + })); +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_grad, + OneDNN, + ONEDNN, + phi::ConvGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h new file mode 100644 index 0000000000000..f5e7f2c29c763 --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -0,0 +1,764 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/onednn/onednn_helper.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/expect.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + +namespace phi { +namespace onednn { + +inline funcs::OneDNNMemoryFormat GetWeightsFormat(int groups, bool is_conv3d) { + if (is_conv3d) { + return (groups == 1) ? funcs::OneDNNMemoryFormat::oidhw + : funcs::OneDNNMemoryFormat::goidhw; + } else { + return (groups == 1) ? funcs::OneDNNMemoryFormat::oihw + : funcs::OneDNNMemoryFormat::goihw; + } +} + +template +class ConvOneDNNHandlerT + : public funcs::OneDNNHandlerT { + public: + ConvOneDNNHandlerT(const OneDNNContext& dev_ctx, + const dnnl::engine mkldnn_engine, + Place cpu_place, + const phi::DenseTensor* input, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + phi::DenseTensor* output) + : funcs::OneDNNHandlerT( + dev_ctx, + mkldnn_engine, + cpu_place, + funcs::CreateKey(dev_ctx, phi::vectorize(input->dims()))) { + if (unlikely(!this->isCached())) { + PADDLE_ENFORCE_EQ( + input->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + input->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The Filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_GE( + input->dims().size(), + 4, + phi::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); + PADDLE_ENFORCE_LE( + input->dims().size(), + 5, + phi::errors::InvalidArgument( + "Input must be with 4 or 5 dimensions, i.e. NCHW or " + "NCDHW, but got dimension = %d .", + input->dims().size())); + + PADDLE_ENFORCE_GE( + filter->dims().size(), + 4, + phi::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); + PADDLE_ENFORCE_LE( + filter->dims().size(), + 5, + phi::errors::InvalidArgument( + "Filter must be with 4 or 5 dimensions, i.e. OIHW or " + "OIDHW, but got dimension = %d .", + filter->dims().size())); + + if (bias) { + PADDLE_ENFORCE_EQ( + bias->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The Bias tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + bias->layout())); + + PADDLE_ENFORCE_EQ( + bias->dims().size(), + 1, + phi::errors::InvalidArgument("Bias must only have 1 dimension, " + "i.e. X, but got dimension = %d .", + bias->dims().size())); + } + + const auto input_dims = input->dims(); + const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + const auto filter_dims = filter->dims(); + const auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + const auto ksize = phi::vectorize(filter_data_dims); + std::vector strides(begin(strides), end(strides)); + std::vector paddings(begin(paddings), end(paddings)); + std::vector dilations(begin(dilations), end(dilations)); + + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); + + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + + const auto src_tz = phi::vectorize(input->dims()); + + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + const auto dst_tz = phi::vectorize(output->dims()); + + const dnnl::memory::dims stride_dims = strides; + const auto onednn_paddings = funcs::ToOnednnPadding(paddings); + const dnnl::memory::dims dilations_dims = dilations; + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + auto data_type = dnnl::memory::data_type::f32; + if (is_BFLOAT16 || std::is_same::value) { + data_type = dnnl::memory::data_type::bf16; + } + + dnnl::memory::desc src_md, weights_md; + if (funcs::is_int8()) { + src_md = funcs::OneDNNMemDesc(src_tz, + funcs::ToOneDNNDataType(input->dtype()), + chosen_memory_format); + weights_md = funcs::OneDNNMemDesc( + weights_tz, dnnl::memory::data_type::s8, chosen_memory_format); + } else { + src_md = funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = funcs::OneDNNMemDesc( + weights_tz, data_type, funcs::OneDNNMemoryFormat::any); + } + + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + + const dnnl::primitive_attr conv_attr = CreateConvAttrs(filter, + groups, + force_fp32_output, + fuse_residual_conn, + fuse_activation); + + if (bias) { + auto bias_tz = phi::vectorize(bias->dims()); + dnnl::memory::desc bias_md; + if (funcs::is_int8()) { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::s32, + funcs::OneDNNMemoryFormat::x); + } else { + bias_md = funcs::OneDNNMemDesc( + bias_tz, data_type, funcs::OneDNNMemoryFormat::x); + } + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + fwd_prop_kind, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + fwd_prop_kind, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + } + } + + ConvOneDNNHandlerT(const OneDNNContext& dev_ctx, + Place cpu_place, + const phi::DenseTensor* in, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + const phi::DenseTensor* out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + phi::DenseTensor* filter_grad, + phi::DenseTensor* in_x_grad) + : funcs::OneDNNHandlerT( + dev_ctx, + dev_ctx.GetEngine(), + cpu_place, + funcs::CreateKey(dev_ctx, phi::vectorize(in->dims()))) { + if (unlikely(!this->isBwdCached())) { + PADDLE_ENFORCE_EQ( + in->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + in->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_EQ( + out_grad->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The output_grad tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + out_grad->layout())); + + PADDLE_ENFORCE_EQ( + is_test, + false, + phi::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); + + std::vector strides(begin(strides), end(strides)); + std::vector paddings(begin(paddings), end(paddings)); + std::vector dilations(begin(dilations), end(dilations)); + + auto input_dims = in->dims(); + auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + auto filter_dims = filter->dims(); + auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + auto ksize = phi::vectorize(filter_data_dims); + + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); + + auto src_tz = phi::vectorize(in->dims()); + auto weights_tz = phi::vectorize(filter->dims()); + + int g = std::max(groups, 1); + funcs::GetGroupConvWeightsTz(weights_tz, g); + auto dst_tz = phi::vectorize(out_grad->dims()); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + const auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + const auto weights_format = funcs::OneDNNMemoryFormat::any; + + auto src_md = funcs::OneDNNMemDesc( + src_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + auto diff_src_md = funcs::OneDNNMemDesc( + src_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + auto weights_md = funcs::OneDNNMemDesc( + weights_tz, funcs::OneDNNGetDataType(), weights_format); + auto diff_weights_md = funcs::OneDNNMemDesc( + weights_tz, funcs::OneDNNGetDataType(), weights_format); + auto diff_dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + + auto onednn_paddings = funcs::ToOnednnPadding(paddings); + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + const dnnl::memory::dims dilations_dims = dilations; + + const dnnl::memory::dims stride_dims = strides; + // Recreating FWD PD. For training there are no post ops in convolution + dnnl::primitive_attr conv_attr; + if (bias) { + auto bias_tz = phi::vectorize(bias->dims()); + dnnl::memory::desc bias_md; + if (funcs::is_int8()) { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::s32, + funcs::OneDNNMemoryFormat::x); + } else { + bias_md = funcs::OneDNNMemDesc(bias_tz, + dnnl::memory::data_type::f32, + funcs::OneDNNMemoryFormat::x); + } + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, + dnnl::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, + src_md, + weights_md, + dst_md, + stride_dims, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + + this->AcquireBackwardPrimitiveDescriptor( + dnnl::algorithm::convolution_direct, + diff_src_md, + weights_md, + diff_dst_md, + strides, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + + this->AcquireBackwardWeightsPrimitiveDescriptor( + dnnl::algorithm::convolution_direct, + src_md, + diff_weights_md, + diff_dst_md, + strides, + dilations_dims, + onednn_paddings[0], + onednn_paddings[1]); + } + } + + std::shared_ptr>> get_int8_bias_scales( + const DenseTensor* filter, + int groups, + const std::vector& scale_weights_data) { + // Get scales int8 bias key + const std::string key_bs = this->key_ + "@bs"; + + // Scales for int8 bias are to be cached to avoid + // computing them each iteration + auto bias_scale_tuple = + std::static_pointer_cast>>( + this->dev_ctx_.GetBlob(key_bs)); + if (bias_scale_tuple) return bias_scale_tuple; + + const auto& weights_tz = phi::vectorize(filter->dims()); + + const auto& scale_in_data = + this->dev_ctx_.HasDnnAttr("Scale_in") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in")) + : 1.0f; + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + + int count = 1; + if (is_multi_channel) { + count *= weights_tz[0]; + if (groups > 1) { + count *= weights_tz[1]; + } + } + + bias_scale_tuple = + std::make_shared>>(std::make_tuple( + static_cast(mask_reorder), std::vector(count))); + for (int i = 0; i < count; i++) { + std::get<1>(*bias_scale_tuple)[i] = scale_in_data * scale_weights_data[i]; + } + + this->dev_ctx_.SetBlob(key_bs, bias_scale_tuple); + + return bias_scale_tuple; + } + + std::tuple, float> get_int8_scales( + const DenseTensor* filter, + int groups, + bool force_fp32_output, + bool fuse_residual_conn, + const std::string& fuse_activation) const { + const auto& weights_tz = phi::vectorize(filter->dims()); + groups = std::max(groups, 1); + + const auto& scale_weights_data = + this->dev_ctx_.HasDnnAttr("Scale_weights") + ? PADDLE_GET_CONST(std::vector, + this->dev_ctx_.GetDnnAttr("Scale_weights")) + : std::vector{1.0f}; + const auto& scale_in_data = + this->dev_ctx_.HasDnnAttr("Scale_in") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in")) + : 1.0f; + const auto& scale_in_eltwise_data = + this->dev_ctx_.HasDnnAttr("Scale_in_eltwise") + ? PADDLE_GET_CONST(float, + this->dev_ctx_.GetDnnAttr("Scale_in_eltwise")) + : 1.0f; + + bool is_multi_channel = scale_weights_data.size() > 1; + bool has_activation = !fuse_activation.empty(); + const auto& scale_out = + this->dev_ctx_.HasDnnAttr("Scale_out") + ? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_out")) + : 1.0f; + float activation_scale = + (!force_fp32_output && has_activation) ? scale_out : 1.0f; + + float scale_out_data = + (force_fp32_output || has_activation) ? 1.0f : scale_out; + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale, activation_scale); + } + + dnnl::primitive_attr CreateConvAttrs(const DenseTensor* filter, + int groups, + bool force_fp32_output, + bool fuse_residual_conn, + const std::string& fuse_activation) { + dnnl::primitive_attr conv_attr; + dnnl::post_ops post_operations; + + float sum_scale = 1.0f; + float activation_scale = 1.0f; + std::vector output_shift_scale; + if (funcs::is_int8()) { + if (this->dev_ctx_.HasDnnAttr("Sum_scale")) { + sum_scale = + PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Sum_scale")); + activation_scale = + this->dev_ctx_.HasDnnAttr("Activation_scale") + ? PADDLE_GET_CONST( + float, this->dev_ctx_.GetDnnAttr("Activation_scale")) + : activation_scale; + output_shift_scale = + this->dev_ctx_.HasDnnAttr("Output_shift_scale") + ? PADDLE_GET_CONST( + std::vector, + this->dev_ctx_.GetDnnAttr("Output_shift_scale")) + : output_shift_scale; + } else { + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(filter, + groups, + force_fp32_output, + fuse_residual_conn, + fuse_activation); + } + + if (output_shift_scale.size() > 0) { + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + } + } + + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_residual_connection is + // true, the output tensor contains the data coming from residual + // connection. The result of this post_op is: + // Output = scale * Output + Conv_Out. + if (fuse_residual_conn) { + post_operations.append_sum(sum_scale); + } + + funcs::AppendActivation(this->dev_ctx_, post_operations, activation_scale); + + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::shared_ptr + AcquireWeightsMemoryWithReorderFromDataPrimitive( + const phi::DenseTensor* filter, const int groups, const bool is_conv3d) { + const K* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->bwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_d_p", + false); + } + + std::shared_ptr AcquireSrcMemoryWithReorder( + const phi::DenseTensor* input) { + return this->AcquireMemoryWithReorderPrimitive(input, + "@src_mem_p_user", + "@src_mem_p_target", + "@src_mem_p", + this->fwd_pd_->src_desc()); + } + + std::shared_ptr AcquireSrcMemoryWithReorderFromWeightsPrimitive( + const phi::DenseTensor* input) { + return this->AcquireMemoryWithReorderPrimitive(input, + "@src_mem_w_p_user", + "@src_mem_w_p_target", + "@src_mem_w_p", + this->bwd_w_pd_->src_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + const phi::DenseTensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, + "@diff_dst_mem_w_p_user", + "@diff_dst_mem_w_p_target", + "@diff_dst_mem_w_p", + this->bwd_w_pd_->diff_dst_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + const phi::DenseTensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, + "@diff_dst_mem_p_user", + "@diff_dst_mem_p_target", + "@diff_dst_mem_p", + this->bwd_pd_->diff_dst_desc()); + } + + std::shared_ptr AcquireMemoryWithReorderPrimitive( + const phi::DenseTensor* in_mem, + const char* key_mem_user, + const char* key_mem_target, + const char* key_mem, + const dnnl::memory::desc& mem_md) { + const T* in_mem_data = in_mem->data(); + const std::string user_key_suffix{key_mem_user}; + auto user_mem_p = this->AcquireMemory(user_key_suffix); + + if (!user_mem_p) { + return this->AcquireMemoryWithReorder(in_mem->mem_desc(), + mem_md, + funcs::to_void_cast(in_mem_data), + key_mem); + } else { + const std::string target_key_suffix{key_mem_target}; + const auto target_mem_p = this->AcquireMemory(target_key_suffix); + user_mem_p->set_data_handle(funcs::to_void_cast(in_mem_data)); + if (user_mem_p != target_mem_p) { + this->AcquireReorder(user_mem_p, target_mem_p); + } + return target_mem_p; + } + } + + std::shared_ptr AcquireWeightsMemoryWithReorder( + const phi::DenseTensor* filter, + const int groups, + const bool is_conv3d, + const bool is_test, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + // This is workaround to make execution faster, delete + // if statement after including md inside Tensor + auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); + if (is_test && weights_mem_p) { + return weights_mem_p; + } else if (is_test) { + const K* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_p", + is_test, + {}, + scale_data, + mask); + } else { + const T* filter_data = filter->data(); + auto weights_tz = phi::vectorize(filter->dims()); + funcs::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); + + return this->AcquireMemoryWithReorder(user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + "@weights_mem_p", + is_test, + {}, + scale_data, + mask); + } + } + + std::shared_ptr AcquireBiasMemoryWithReorder( + const phi::DenseTensor* bias, + const bool is_test, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); + if (is_test && bias_mem_p) { + return bias_mem_p; + } else { + // if K is int8 (weights are int8) then biases are int32 + using K_Bias = typename std:: + conditional::value, int32_t, K>::type; + if (std::is_same::value && + bias->dtype() != phi::DataType::INT32) { + LOG(ERROR) << "Bias should be of type int32 but is " << bias->dtype(); + } + const K_Bias* bias_data = bias->data(); + + return this->AcquireMemoryWithReorder( + bias->mem_desc(), + this->fwd_pd_->bias_desc(), + funcs::to_void_cast(bias_data), + "@bias_mem_p", + is_test, + {}, + scale_data, + mask); + } + } + + std::shared_ptr AcquireResidualMemory( + const phi::DenseTensor* residual_param) { + void* residual_data = + residual_param->dtype() == + paddle::experimental::CppTypeToDataType::Type() + ? funcs::to_void_cast(residual_param->data()) + : funcs::to_void_cast(residual_param->data()); + auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); + if (residual_mem_p) { + residual_mem_p->set_data_handle(residual_data); + return residual_mem_p; + } else { + return this->AcquireMemoryFromPrimitive(residual_param->mem_desc(), + residual_data, + "@user_residual_data_mem_p"); + } + } + + std::shared_ptr AcquireDstMemoryWithResidual( + phi::DenseTensor* output, const phi::DenseTensor* residual_param) { + std::shared_ptr dst_memory_p; + if (residual_param->mem_desc() != this->fwd_pd_->dst_desc()) { + auto residual_memory_p = this->AcquireResidualMemory(residual_param); + dst_memory_p = this->template AcquireDstMemory(output); + this->AcquireReorder(residual_memory_p, dst_memory_p); + } else { + // Changing ShareDataWith to TensorCopy results in performance drop + // on ResNet architectures + // (https://github.com/PaddlePaddle/Paddle/issues/22964) + output->ShareDataWith(*residual_param); + dst_memory_p = this->template AcquireDstMemory(output); + } + return dst_memory_p; + } +}; + +} // namespace onednn +} // namespace phi diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc new file mode 100644 index 0000000000000..a189b3c8db5b2 --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -0,0 +1,457 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/conv_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" +#include "paddle/phi/kernels/onednn/conv_handler.h" + +namespace phi { + +static dnnl::memory::data_type GetDstType( + bool is_int8, + bool is_bfloat16, + bool force_fp32_output, + std::string fuse_activation, + bool fuse_residual_conn, + const phi::DenseTensor* residual_param) { + auto dst_dt = dnnl::memory::data_type::f32; + if (is_int8) { + dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") + ? dnnl::memory::data_type::u8 + : dnnl::memory::data_type::s8; + if (force_fp32_output) { + dst_dt = dnnl::memory::data_type::f32; + } + if (fuse_residual_conn && residual_param) { + auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + if (dst_dt != residual_dt) dst_dt = residual_dt; + } + } else { + if (!force_fp32_output && is_bfloat16) { + dst_dt = dnnl::memory::data_type::bf16; + if (fuse_residual_conn && residual_param) { + dst_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + } + } + } + return dst_dt; +} + +#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +template +void ComputeFP32(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + + PD_VISIT_FLOAT_AND_INT8_TYPES( + filter->dtype(), "ConvOneDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + mkldnn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, is_conv3d, is_test); + + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + auto bias_memory_p = + handler.AcquireBiasMemoryWithReorder(bias, is_test); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ComputeINT8(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + + bool unsigned_output = + (fuse_activation == "relu" || fuse_activation == "relu6"); + bool need_s8_to_u8 = false; + + PADDLE_ENFORCE_NE( + is_conv3d, + true, + phi::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, + false, + phi::errors::Unimplemented( + "residual fusion does not support force output with fp32")); + + PD_VISIT_FLOAT_AND_INT8_TYPES( + filter->dtype(), "ConvMKLDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + mkldnn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + + const auto& scale_weights_data = + dev_ctx.HasDnnAttr("Scale_weights") + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Scale_weights")) + : std::vector{1.0f}; + const bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel + ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) + : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, true, scale_weights_data, mask_reorder); + + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + PADDLE_ENFORCE_EQ( + output->dims(), + residual_param->dims(), + phi::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), + residual_param->dims().size())); + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (funcs::OneDNNGetDataType() == + dnnl::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + std::vector bias_scales; + auto p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + if (dev_ctx.HasDnnAttr("Bias_scales")) { + bias_scales = PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Bias_scales")); + p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + } else { + p_scales_tuple = handler.get_int8_bias_scales( + filter, groups, scale_weights_data); + } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, + true, + std::get<1>(*p_scales_tuple), + std::get<0>(*p_scales_tuple)); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + + if (need_s8_to_u8) { + dev_ctx.Alloc(output); + } + + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ConvKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace")); + bool is_INT8 = + std::is_same::value || std::is_same::value; + + VLOG(1) << "Conv Has mkldnn_data_type: " + << dev_ctx.HasDnnAttr("mkldnn_data_type"); + VLOG(1) << "Conv Has ResidualData " + << dev_ctx.HasDnnInput("mkldnn_data_type"); + VLOG(1) << "Conv Has fuse_residual_connection " + << dev_ctx.HasDnnAttr("fuse_residual_connection"); + VLOG(1) << "Conv Has fuse_activation " + << dev_ctx.HasDnnAttr("fuse_activation"); + VLOG(1) << "Conv Has force_fp32_output " + << dev_ctx.HasDnnAttr("force_fp32_output"); + VLOG(1) << "Conv Has Scale_weights " << dev_ctx.HasDnnAttr("Scale_weights"); + VLOG(1) << "Conv Has Scale_in " << dev_ctx.HasDnnAttr("Scale_in"); + VLOG(1) << "Conv Has Scale_in_eltwise " + << dev_ctx.HasDnnAttr("Scale_in_eltwise"); + VLOG(1) << "Conv Has Scale_out " << dev_ctx.HasDnnAttr("Scale_out"); + VLOG(1) << "Conv Has is_test " << dev_ctx.HasDnnAttr("is_test"); + VLOG(1) << "Conv Has Bias_scales " << dev_ctx.HasDnnAttr("Bias_scales"); + + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr; + const auto* residual_param = dev_ctx.HasDnnInput("ResidualData") + ? dev_ctx.GetDnnInput("ResidualData") + : nullptr; + bool fuse_residual_conn = + dev_ctx.HasDnnAttr("fuse_residual_connection") + ? PADDLE_GET_CONST(bool, + dev_ctx.GetDnnAttr("fuse_residual_connection")) + : false; + const std::string& fuse_activation = + dev_ctx.HasDnnAttr("fuse_activation") + ? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation")) + : ""; + bool force_fp32_output = + dev_ctx.HasDnnAttr("force_fp32_output") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) + : false; + + auto dst_dt = GetDstType(is_INT8, + is_BFLOAT16, + force_fp32_output, + fuse_activation, + fuse_residual_conn, + residual_param); + + if (!is_INT8) { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeFP32(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::bf16) { + ComputeFP32(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } + } else { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::u8) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::s8) { + ComputeINT8(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d, + OneDNN, + ONEDNN, + phi::ConvKernel, + float, + phi::dtype::bfloat16, + uint8_t, + int8_t) {} diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index 778ceb4773c30..ad97d86e916fa 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -148,7 +148,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -164,7 +164,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, out_grad, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index fca984a8606ec..05f5f939187c4 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -114,7 +114,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, - const std::string& paddding_algorithm, + const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, @@ -128,7 +128,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, filter, strides, paddings, - paddding_algorithm, + padding_algorithm, dilations, groups, data_format, From 6331512d8753ab0162380930ae14dc7b47ed363c Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 17 Oct 2022 17:08:42 +0800 Subject: [PATCH 19/36] fix ci bugs (#36) --- paddle/fluid/framework/operator.cc | 9 ++++ .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 14 ----- paddle/phi/backends/onednn/onednn_context.cc | 54 +++++++++++++++++++ paddle/phi/backends/onednn/onednn_context.h | 11 ++++ paddle/phi/kernels/onednn/conv_grad_kernel.cc | 7 ++- paddle/phi/kernels/onednn/conv_handler.h | 42 +++++++-------- paddle/phi/kernels/onednn/conv_kernel.cc | 37 +++---------- 7 files changed, 107 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5cdaf14f3cb33..e29ebe4d226d3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2698,6 +2698,15 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = phi_kernel_->args_def().attribute_defs(); auto output_defs = phi_kernel_->args_def().output_defs(); +#if defined(PADDLE_WITH_MKLDNN) + if (phi::OneDNNContext::classof(dev_ctx)) { + // Onednn holds this op's variable's name and init them here. + phi::OneDNNContext* one_dnn_ctx = static_cast(dev_ctx); + one_dnn_ctx->SetInputsName(Inputs()); + one_dnn_ctx->SetOutputsName(Outputs()); + } +#endif + PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index fad41e7a133f8..bd7971751d156 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1182,20 +1182,6 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(conv2d, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel, - ops::ConvMKLDNNOpKernel); - -REGISTER_OP_KERNEL(conv2d_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvMKLDNNGradOpKernel, - ops::ConvMKLDNNGradOpKernel); - REGISTER_OP_KERNEL(depthwise_conv2d, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index 5aa8090c62fe5..6c69191c94457 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -319,6 +319,35 @@ struct OneDNNContext::Impl { dnn_inputs_[input_name] = input; } + void SetInputsName(const TensorNameMap& inputs_name) { + inputs_name_ = inputs_name; + } + + void SetOutputsName(const TensorNameMap& outputs_name) { + outputs_name_ = outputs_name; + } + + const std::vector& GetInputsName( + const std::string& input) const { + auto it = inputs_name_.find(input); + PADDLE_ENFORCE_NE(it, + inputs_name_.end(), + phi::errors::NotFound( + "OneDnnContext does not have the input %s.", input)); + return it->second; + } + + const std::vector& GetOutputsName( + const std::string& output) const { + auto it = outputs_name_.find(output); + PADDLE_ENFORCE_NE( + it, + outputs_name_.end(), + phi::errors::NotFound("OneDnnContext does not have the output %s.", + output)); + return it->second; + } + std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase @@ -342,11 +371,18 @@ struct OneDNNContext::Impl { // to remove this member in the future. static thread_local paddle::flat_hash_map dnn_inputs_; + + // Onednn need get input and output's name in current Kernel for generating + // unique_key. + static thread_local TensorNameMap inputs_name_; + static thread_local TensorNameMap outputs_name_; }; thread_local AttributeMap OneDNNContext::Impl::dnn_attrs_ = {}; thread_local paddle::flat_hash_map OneDNNContext::Impl::dnn_inputs_ = {}; +thread_local TensorNameMap OneDNNContext::Impl::inputs_name_ = {}; +thread_local TensorNameMap OneDNNContext::Impl::outputs_name_ = {}; OneDNNContext::OneDNNContext(const Place& place) : CPUContext(place), impl_(std::make_unique()) {} @@ -403,5 +439,23 @@ void OneDNNContext::SetDnnInput(const std::string& input_name, return impl_->SetDnnInput(input_name, input); } +void OneDNNContext::SetInputsName(const TensorNameMap& inputs_name) { + impl_->SetInputsName(inputs_name); +} + +void OneDNNContext::SetOutputsName(const TensorNameMap& outputs_name) { + impl_->SetOutputsName(outputs_name); +} + +const std::vector& OneDNNContext::GetInputsName( + const std::string& input) const { + return impl_->GetInputsName(input); +} + +const std::vector& OneDNNContext::GetOutputsName( + const std::string& output) const { + return impl_->GetOutputsName(output); +} + } // namespace phi #endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index d7cf1752bcce4..9035aef5f9a32 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -24,6 +24,8 @@ limitations under the License. */ namespace phi { +using TensorNameMap = std::map>; + class OneDNNContextThreadLocals { // default mkldnn session id @@ -144,6 +146,15 @@ class OneDNNContext : public CPUContext { const DenseTensor* GetDnnInput(const std::string& input_name) const; void SetDnnInput(const std::string& input_name, const DenseTensor* input); + void SetInputsName(const TensorNameMap& inputs_name); + + void SetOutputsName(const TensorNameMap& outputs_name); + + const std::vector& GetInputsName(const std::string& input) const; + + const std::vector& GetOutputsName( + const std::string& output) const; + static const char* name() { return "OneDNNContext"; } private: diff --git a/paddle/phi/kernels/onednn/conv_grad_kernel.cc b/paddle/phi/kernels/onednn/conv_grad_kernel.cc index 190fe26cd346b..a9bd4a8368127 100644 --- a/paddle/phi/kernels/onednn/conv_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_grad_kernel.cc @@ -56,7 +56,6 @@ void ConvGradKernel(const Context& dev_ctx, phi::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); const auto& mkldnn_engine = dev_ctx.GetEngine(); - VLOG(1) << "ConvGrad Has is_test " << dev_ctx.HasDnnAttr("is_test"); const auto* bias = @@ -67,6 +66,9 @@ void ConvGradKernel(const Context& dev_ctx, if (!input_grad && !filter_grad) return; + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + PD_VISIT_FLOAT_AND_BF16_TYPES( filter.dtype(), "ConvOneDNNHandlerT", ([&] { // TODO(jczaja): Are all tensors really needed? @@ -84,7 +86,8 @@ void ConvGradKernel(const Context& dev_ctx, data_format, is_test, filter_grad, - input_grad); + input_grad, + unique_name); // create mkldnn memory from input tensors (data/weights) auto& astream = OneDNNContext::tls().get_stream(); diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index f5e7f2c29c763..d10b89de47be9 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -45,10 +45,10 @@ class ConvOneDNNHandlerT const phi::DenseTensor* input, const phi::DenseTensor* filter, const phi::DenseTensor* bias, - const std::vector& strides, - const std::vector& paddings, + const std::vector& strides_in, + const std::vector& paddings_in, const std::string& padding_algorithm, - const std::vector& dilations, + const std::vector& dilations_in, int groups, const std::string& data_format, bool is_test, @@ -56,7 +56,8 @@ class ConvOneDNNHandlerT const std::string& fuse_activation, bool fuse_residual_conn, bool force_fp32_output, - phi::DenseTensor* output) + phi::DenseTensor* output, + const std::string& unique_name) : funcs::OneDNNHandlerTdims()))) { + funcs::CreateKey( + dev_ctx, phi::vectorize(input->dims()), unique_name)) { if (unlikely(!this->isCached())) { PADDLE_ENFORCE_EQ( input->layout(), @@ -128,21 +130,17 @@ class ConvOneDNNHandlerT "i.e. X, but got dimension = %d .", bias->dims().size())); } - const auto input_dims = input->dims(); const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); const auto filter_dims = filter->dims(); const auto filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - const auto ksize = phi::vectorize(filter_data_dims); - std::vector strides(begin(strides), end(strides)); - std::vector paddings(begin(paddings), end(paddings)); - std::vector dilations(begin(dilations), end(dilations)); - + std::vector strides(begin(strides_in), end(strides_in)); + std::vector paddings(begin(paddings_in), end(paddings_in)); + std::vector dilations(begin(dilations_in), end(dilations_in)); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - std::transform( dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; @@ -158,7 +156,6 @@ class ConvOneDNNHandlerT const dnnl::memory::dims stride_dims = strides; const auto onednn_paddings = funcs::ToOnednnPadding(paddings); const dnnl::memory::dims dilations_dims = dilations; - /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance @@ -186,7 +183,6 @@ class ConvOneDNNHandlerT dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference : dnnl::prop_kind::forward_training; - const dnnl::primitive_attr conv_attr = CreateConvAttrs(filter, groups, force_fp32_output, @@ -239,15 +235,16 @@ class ConvOneDNNHandlerT const phi::DenseTensor* filter, const phi::DenseTensor* bias, const phi::DenseTensor* out_grad, - const std::vector& strides, - const std::vector& paddings, + const std::vector& strides_in, + const std::vector& paddings_in, const std::string& padding_algorithm, - const std::vector& dilations, + const std::vector& dilations_in, int groups, const std::string& data_format, bool is_test, phi::DenseTensor* filter_grad, - phi::DenseTensor* in_x_grad) + phi::DenseTensor* in_x_grad, + const std::string& unique_name) : funcs::OneDNNHandlerTdims()))) { + funcs::CreateKey( + dev_ctx, phi::vectorize(in->dims()), unique_name)) { if (unlikely(!this->isBwdCached())) { PADDLE_ENFORCE_EQ( in->layout(), @@ -287,9 +285,9 @@ class ConvOneDNNHandlerT phi::errors::InvalidArgument( "is_test attribute should be set to False in training phase.")); - std::vector strides(begin(strides), end(strides)); - std::vector paddings(begin(paddings), end(paddings)); - std::vector dilations(begin(dilations), end(dilations)); + std::vector strides(begin(strides_in), end(strides_in)); + std::vector paddings(begin(paddings_in), end(paddings_in)); + std::vector dilations(begin(dilations_in), end(dilations_in)); auto input_dims = in->dims(); auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc index a189b3c8db5b2..c88c260c03098 100644 --- a/paddle/phi/kernels/onednn/conv_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -86,7 +86,8 @@ void ComputeFP32(const OneDNNContext& dev_ctx, DenseTensor* output) { const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_conv3d = strides.size() == 3U; - + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; PD_VISIT_FLOAT_AND_INT8_TYPES( filter->dtype(), "ConvOneDNNHandlerT", ([&] { onednn::ConvOneDNNHandlerT handler(dev_ctx, @@ -106,13 +107,11 @@ void ComputeFP32(const OneDNNContext& dev_ctx, fuse_activation, fuse_residual_conn, force_fp32_output, - output); - + output, + unique_name); auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( filter, groups, is_conv3d, is_test); - std::shared_ptr dst_memory_p; if (fuse_residual_conn) { dst_memory_p = @@ -122,7 +121,6 @@ void ComputeFP32(const OneDNNContext& dev_ctx, } auto conv_p = handler.AcquireForwardPrimitive(); - std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_WEIGHTS, *weights_memory_p}, @@ -137,7 +135,6 @@ void ComputeFP32(const OneDNNContext& dev_ctx, auto& astream = OneDNNContext::tls().get_stream(); conv_p->execute(astream, args); astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc()); })); } @@ -177,7 +174,8 @@ void ComputeINT8(const OneDNNContext& dev_ctx, false, phi::errors::Unimplemented( "residual fusion does not support force output with fp32")); - + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; PD_VISIT_FLOAT_AND_INT8_TYPES( filter->dtype(), "ConvMKLDNNHandlerT", ([&] { onednn::ConvOneDNNHandlerT handler(dev_ctx, @@ -197,7 +195,8 @@ void ComputeINT8(const OneDNNContext& dev_ctx, fuse_activation, fuse_residual_conn, force_fp32_output, - output); + output, + unique_name); auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); @@ -295,24 +294,6 @@ void ConvKernel(const Context& dev_ctx, bool is_INT8 = std::is_same::value || std::is_same::value; - VLOG(1) << "Conv Has mkldnn_data_type: " - << dev_ctx.HasDnnAttr("mkldnn_data_type"); - VLOG(1) << "Conv Has ResidualData " - << dev_ctx.HasDnnInput("mkldnn_data_type"); - VLOG(1) << "Conv Has fuse_residual_connection " - << dev_ctx.HasDnnAttr("fuse_residual_connection"); - VLOG(1) << "Conv Has fuse_activation " - << dev_ctx.HasDnnAttr("fuse_activation"); - VLOG(1) << "Conv Has force_fp32_output " - << dev_ctx.HasDnnAttr("force_fp32_output"); - VLOG(1) << "Conv Has Scale_weights " << dev_ctx.HasDnnAttr("Scale_weights"); - VLOG(1) << "Conv Has Scale_in " << dev_ctx.HasDnnAttr("Scale_in"); - VLOG(1) << "Conv Has Scale_in_eltwise " - << dev_ctx.HasDnnAttr("Scale_in_eltwise"); - VLOG(1) << "Conv Has Scale_out " << dev_ctx.HasDnnAttr("Scale_out"); - VLOG(1) << "Conv Has is_test " << dev_ctx.HasDnnAttr("is_test"); - VLOG(1) << "Conv Has Bias_scales " << dev_ctx.HasDnnAttr("Bias_scales"); - bool is_test = dev_ctx.HasDnnAttr("is_test") ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) : false; @@ -340,14 +321,12 @@ void ConvKernel(const Context& dev_ctx, dev_ctx.HasDnnAttr("force_fp32_output") ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) : false; - auto dst_dt = GetDstType(is_INT8, is_BFLOAT16, force_fp32_output, fuse_activation, fuse_residual_conn, residual_param); - if (!is_INT8) { if (dst_dt == dnnl::memory::data_type::f32) { ComputeFP32(dev_ctx, From 37afcf63bac5712dc613187c121f3fba9835af04 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 17 Oct 2022 19:40:40 +0800 Subject: [PATCH 20/36] fix compile bugs (#38) --- paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 60c9b8f265960..9a4237230e86f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(conv2d); -USE_OP_DEVICE_KERNEL(conv2d, MKLDNN); +PD_DECLARE_KERNEL(conv2d, OneDNN, ONEDNN); namespace paddle { namespace operators { From 39e2529b503ddd8fe2ed2e39f0289b503b2e833a Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 18 Oct 2022 21:04:47 +0800 Subject: [PATCH 21/36] fix extra input transform bug (#39) --- paddle/fluid/framework/operator.cc | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e29ebe4d226d3..d13d2faca611f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2267,7 +2267,8 @@ Scope* OperatorWithKernel::PrepareData( } std::unique_ptr new_expected_kernel_key = nullptr; - if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) { + if (run_phi_kernel_ && in_def != nullptr && + in_def->backend != phi::Backend::ALL_BACKEND) { auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); if ((in_def->backend != tensor_backend && (in_def->backend != phi::Backend::GPUDNN || @@ -2376,7 +2377,10 @@ Scope* OperatorWithKernel::PrepareData( }; if (run_phi_kernel_) { - const auto& input_names = kernel_signature_->input_names; + std::vector input_names; + for (auto& input_name : kernel_signature_->input_names) { + input_names.emplace_back(input_name); + } const auto& input_defs = phi_kernel_->args_def().input_defs(); PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), @@ -2398,6 +2402,19 @@ Scope* OperatorWithKernel::PrepareData( no_buffer_ins && no_buffer_ins->count(input_name) > 0; prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); } + // For input that is Extra + for (auto& var_name_item : Inputs()) { + if (std::find(input_names.begin(), + input_names.end(), + var_name_item.first) == input_names.end()) { + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; + std::vector& input_vars = ctx->inputs[var_name_item.first]; + prepare_input_data( + var_name_item.first, &input_vars, nullptr, should_skip_input); + } + } + } else { for (auto& var_name_item : Inputs()) { bool should_skip_input = From 16d595c8fb7346b697f01c2284ed32fb9a22c912 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 21 Oct 2022 17:57:32 +0800 Subject: [PATCH 22/36] support dynamic created attr (#40) --- paddle/fluid/framework/op_desc.cc | 21 +++- paddle/fluid/operators/ops_extra_info.h | 12 +++ .../api/yaml/generator/ops_extra_info_gen.py | 100 ++++++++++-------- paddle/phi/api/yaml/op_compat.yaml | 1 + paddle/phi/kernels/onednn/conv_handler.h | 2 +- 5 files changed, 87 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fc3a8bf24dbcf..7d99b77f487f5 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -495,11 +495,17 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) attr_type != proto::AttrType::BLOCKS && attr_type != proto::AttrType::VAR && attr_type != proto::AttrType::VARS) { - auto iter = runtime_attrs_.find(attr_name); - if (iter == runtime_attrs_.end()) { - attrs_[attr_name] = GetAttrValue(attr); + auto default_attr_iter = runtime_attrs_.find(attr_name); + const auto &extra_dynamic_attr_map = + operators::ExtraInfoUtils::Instance().GetExtraDynamicAttrsMap(Type()); + auto dynamic_attr_iter = extra_dynamic_attr_map.find(attr_name); + + if (default_attr_iter != runtime_attrs_.end()) { + default_attr_iter->second = GetAttrValue(attr); + } else if (dynamic_attr_iter != extra_dynamic_attr_map.end()) { + runtime_attrs_[attr_name] = GetAttrValue(attr); } else { - iter->second = GetAttrValue(attr); + attrs_[attr_name] = GetAttrValue(attr); } } } @@ -665,8 +671,13 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { const auto &extra_attr_map = operators::ExtraInfoUtils::Instance().GetExtraAttrsMap(Type()); + const auto &extra_dynamic_attr_map = + operators::ExtraInfoUtils::Instance().GetExtraDynamicAttrsMap(Type()); auto extra_attr_iter = extra_attr_map.find(name); - if (extra_attr_iter != extra_attr_map.end()) { + auto extra_dynamic_attr_iter = extra_dynamic_attr_map.find(name); + // auto pass_attr_iter = operators::extra_attr_properties.find(name); + if (extra_attr_iter != extra_attr_map.end() || + extra_dynamic_attr_iter != extra_dynamic_attr_map.end()) { is_runtime_attr = true; attrs_ptr = &(this->runtime_attrs_); } diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index d05d4a57ac492..30c4c7c2be476 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -90,6 +90,7 @@ const std::unordered_map // ONEDNN dedicated attributes {"Activation_scale", ExtraAttrProperty::ONEDNN}, {"Bias", ExtraAttrProperty::ONEDNN}, + {"Bias_scales", ExtraAttrProperty::ONEDNN}, {"data_format", ExtraAttrProperty::ONEDNN}, {"force_fp32_output", ExtraAttrProperty::ONEDNN}, {"fuse_activation", ExtraAttrProperty::ONEDNN}, @@ -188,6 +189,15 @@ class ExtraInfoUtils { return empty_extra_attrs_map_; } + const paddle::framework::AttributeMap& GetExtraDynamicAttrsMap( + const std::string& op_type) const { + auto iter = g_extra_dynamic_attrs_map_.find(op_type); + if (iter != g_extra_dynamic_attrs_map_.end()) { + return iter->second; + } + return empty_extra_attrs_map_; + } + const std::vector>& GetExtraAttrsChecker(const std::string& op_type) const { auto iter = g_extra_attrs_checker_.find(op_type); @@ -211,6 +221,8 @@ class ExtraInfoUtils { std::unordered_map g_extra_attrs_map_; + std::unordered_map + g_extra_dynamic_attrs_map_; paddle::framework::AttributeMap empty_extra_attrs_map_{}; std::unordered_map< std::string, diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py index 85f1d1657557e..c1f63bb2d871d 100644 --- a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py @@ -17,7 +17,7 @@ import argparse -def map_code_template(attrs_str, attrs_checker_str): +def map_code_template(default_attrs_str, dynamic_attr_str, attrs_checker_str): return f"""// This file is generated by paddle/phi/api/yaml/generator/ops_extra_info_gen.py #include "paddle/fluid/operators/ops_extra_info.h" @@ -28,7 +28,11 @@ def map_code_template(attrs_str, attrs_checker_str): ExtraInfoUtils::ExtraInfoUtils() {{ g_extra_attrs_map_ = {{ - {attrs_str} + {default_attrs_str} + }}; + + g_extra_dynamic_attrs_map_ = {{ + {dynamic_attr_str} }}; g_extra_attrs_checker_ = {{ @@ -64,10 +68,7 @@ def parse_attr(attr_str): 'name'), result.group('default_val') -def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): - compat_apis = [] - with open(op_compat_yaml_path, 'rt') as f: - compat_apis = yaml.safe_load(f) +def generate_attr_info(attr_type, op_compat_args): def get_op_name(api_item): names = api_item.split('(') @@ -76,49 +77,62 @@ def get_op_name(api_item): else: return names[1].split(')')[0].strip() - extra_map_str_list = [] - extra_checker_str_list = [] + attr_map_str_list = [] + attr_checker_str_list = [] + extra_args_map = op_compat_args['extra'] + if attr_type in extra_args_map: + attr_map_list = [] + attr_checker_func_list = [] + for attr in extra_args_map[attr_type]: + attr_type, attr_name, default_val = parse_attr(attr) + attr_checker_func_list.append( + f"[](framework::AttributeMap* attr_map, bool only_check_exist_value)-> void {{ ExtraAttrChecker<{attr_type}>(\"{attr_name}\", {default_val})(attr_map, only_check_exist_value);}}" + ) + if attr_type.startswith("std::vector"): + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{default_val}}}") + else: + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}") + api_extra_attr_map = ", ".join(attr_map_list) + api_extra_attr_checkers = ",\n ".join(attr_checker_func_list) + attr_map_str_list.append( + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" + ) + attr_checker_str_list.append( + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" + ) + if 'backward' in op_compat_args: + for bw_item in op_compat_args['backward'].split(','): + bw_op_name = get_op_name(bw_item) + attr_map_str_list.append( + f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}") + attr_checker_str_list.append( + f"{{\"{bw_op_name}\", {{ {api_extra_attr_checkers} }}}}") + return attr_map_str_list, attr_checker_str_list + +def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): + compat_apis = [] + with open(op_compat_yaml_path, 'rt') as f: + compat_apis = yaml.safe_load(f) + extra_default_attr_str_list = [] + extra_dynamic_attr_str_list = [] + extra_checker_str_list = [] for op_compat_args in compat_apis: if 'extra' in op_compat_args: - extra_args_map = op_compat_args['extra'] # TODO(chenweihang): add inputs and outputs - if 'attrs' in extra_args_map: - attr_map_list = [] - attr_checker_func_list = [] - for attr in extra_args_map['attrs']: - attr_type, attr_name, default_val = parse_attr(attr) - attr_checker_func_list.append( - f"[](framework::AttributeMap* attr_map, bool only_check_exist_value)-> void {{ ExtraAttrChecker<{attr_type}>(\"{attr_name}\", {default_val})(attr_map, only_check_exist_value);}}" - ) - if attr_type.startswith("std::vector"): - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{default_val}}}") - else: - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" - ) - api_extra_attr_map = ", ".join(attr_map_list) - api_extra_attr_checkers = ",\n ".join( - attr_checker_func_list) - extra_map_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" - ) - extra_checker_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" - ) - if 'backward' in op_compat_args: - for bw_item in op_compat_args['backward'].split(','): - bw_op_name = get_op_name(bw_item) - extra_map_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}") - extra_checker_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_checkers} }}}}" - ) - + default_attr_map_str, default_attr_checker_str = generate_attr_info( + 'attrs', op_compat_args) + dynamic_attr_map_str, _ = generate_attr_info( + 'dynamic_attrs', op_compat_args) + extra_default_attr_str_list.extend(default_attr_map_str) + extra_dynamic_attr_str_list.extend(dynamic_attr_map_str) + extra_checker_str_list.extend(default_attr_checker_str) ops_extra_info_file = open(ops_extra_info_path, 'w') ops_extra_info_file.write( - map_code_template(",\n ".join(extra_map_str_list), + map_code_template(",\n ".join(extra_default_attr_str_list), + ",\n ".join(extra_dynamic_attr_str_list), ",\n ".join(extra_checker_str_list))) ops_extra_info_file.close() diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index bcb3563a040b4..91faad4cca3c3 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -112,6 +112,7 @@ bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + dynamic_attrs : ['float[] Bias_scales = {1.0f}', float Sum_scale = 1.0f, 'float[] Output_shift_scale = {1.0f}', float Activation_scale = 1.0f] - op : conv2d_fusion extra : diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index d10b89de47be9..b3595291e329e 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -406,6 +406,7 @@ class ConvOneDNNHandlerT // Scales for int8 bias are to be cached to avoid // computing them each iteration + groups = std::max(groups, 1); auto bias_scale_tuple = std::static_pointer_cast>>( this->dev_ctx_.GetBlob(key_bs)); @@ -709,7 +710,6 @@ class ConvOneDNNHandlerT LOG(ERROR) << "Bias should be of type int32 but is " << bias->dtype(); } const K_Bias* bias_data = bias->data(); - return this->AcquireMemoryWithReorder( bias->mem_desc(), this->fwd_pd_->bias_desc(), From c0cdb15f77b4378d64bc40dc046f5c82a9949faa Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Oct 2022 03:01:27 +0000 Subject: [PATCH 23/36] reset extra info gen code --- .../api/yaml/generator/ops_extra_info_gen.py | 48 +++++-------------- 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py index 3efb4178d94e3..a0983b9d26f27 100644 --- a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py @@ -129,46 +129,20 @@ def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): for op_compat_args in compat_apis: if 'extra' in op_compat_args: # TODO(chenweihang): add inputs and outputs - if 'attrs' in extra_args_map: - attr_map_list = [] - attr_checker_func_list = [] - for attr in extra_args_map['attrs']: - attr_type, attr_name, default_val = parse_attr(attr) - attr_checker_func_list.append( - f"[](framework::AttributeMap* attr_map, bool only_check_exist_value)-> void {{ ExtraAttrChecker<{attr_type}>(\"{attr_name}\", {default_val})(attr_map, only_check_exist_value);}}" - ) - if attr_type.startswith("std::vector"): - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{default_val}}}" - ) - else: - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" - ) - api_extra_attr_map = ", ".join(attr_map_list) - api_extra_attr_checkers = ",\n ".join( - attr_checker_func_list - ) - extra_map_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" - ) - extra_checker_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" - ) - if 'backward' in op_compat_args: - for bw_item in op_compat_args['backward'].split(','): - bw_op_name = get_op_name(bw_item) - extra_map_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}" - ) - extra_checker_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_checkers} }}}}" - ) - + default_attr_map_str, default_attr_checker_str = generate_attr_info( + 'attrs', op_compat_args + ) + dynamic_attr_map_str, _ = generate_attr_info( + 'dynamic_attrs', op_compat_args + ) + extra_default_attr_str_list.extend(default_attr_map_str) + extra_dynamic_attr_str_list.extend(dynamic_attr_map_str) + extra_checker_str_list.extend(default_attr_checker_str) ops_extra_info_file = open(ops_extra_info_path, 'w') ops_extra_info_file.write( map_code_template( - ",\n ".join(extra_map_str_list), + ",\n ".join(extra_default_attr_str_list), + ",\n ".join(extra_dynamic_attr_str_list), ",\n ".join(extra_checker_str_list), ) ) From dafdbcc068eb34369bd9afcaed5e5a39629b4a60 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Oct 2022 03:22:48 +0000 Subject: [PATCH 24/36] rm conv_grad_grad kernel --- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 824 ------------------ 1 file changed, 824 deletions(-) delete mode 100644 paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu deleted file mode 100644 index 7b4bd05991ddd..0000000000000 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ /dev/null @@ -1,824 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/conv_grad_grad_kernel.h" - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" -#else -#include "paddle/phi/kernels/gpudnn/conv_cudnn_v7.h" -#endif - -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/padding.h" -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" - -namespace phi { - -template -void ConvCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - auto X = &input; - auto W = &filter; - auto dO = &out_grad; - auto ddX = input_grad_grad.get_ptr(); - auto ddW = filter_grad_grad.get_ptr(); - - auto ddO = out_grad_grad; - auto dW = filter_grad; - auto dX = input_grad; - if (ddO) { - ctx.template Alloc(ddO); - phi::funcs::SetConstant set_zero; - set_zero(ctx, ddO, static_cast(0)); - } - if (dW) { - ctx.template Alloc(dW); - } - if (dX) { - ctx.template Alloc(dX); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - std::vector dilations = dilations_t; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, - false, - phi::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = paddings_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - DenseTensor transformed_X_channel(X->type()); - DenseTensor transformed_dO_channel(dO->type()); - DenseTensor transformed_ddX_channel(X->type()); - - DenseTensor transformed_ddO_channel(dO->type()); - DenseTensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X_channel); - TransToChannelFirst(ctx, X, &transformed_X_channel); - - ResizeToChannelFirst(ctx, dO, &transformed_dO_channel); - TransToChannelFirst(ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst(ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst(ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst(ctx, dX, &transformed_dX_channel); - ctx.template Alloc(&transformed_dX_channel); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim); - DenseTensor transformed_X(X->type()); - DenseTensor transformed_ddX(X->type()); - - DenseTensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - DDim new_input_shape(make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - ctx.template Alloc(&transformed_X); - - if (ddX) { - ctx.template Alloc(&transformed_ddX); - } - if (dX) { - ctx.template Alloc(&transformed_dX); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - case 5: { - funcs::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - funcs::PadFunction(ctx, - input_pad, - transformed_ddX_channel, - pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = paddle::platform::CudnnDataType::type; - - auto handle = ctx.cudnn_handle(); - auto layout = paddle::platform::GetCudnnTensorFormat( - paddle::platform::DataLayout::kNCHW); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args2{&transformed_X, - ddW, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - ConvArgs args4{&transformed_dX, - ddW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype, - groups, - paddle::platform::DataLayout::kNCHW}; - -#ifdef PADDLE_WITH_HIP - SearchResult fwd_result1; - SearchResult fwd_result2; - SearchResult data_result; - SearchResult filter_result; -#else - SearchResult fwd_result1; - SearchResult fwd_result2; - SearchResult data_result; - SearchResult filter_result; -#endif - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_result1.algo = search1::Find( - args1, exhaustive_search, false, workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); - workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_result2.algo = search2::Find( - args2, exhaustive_search, false, workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); - workspace_size = std::max( - workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_result.algo = search3::Find( - args3, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_result = - search3::Find(ctx, args3, exhaustive_search, deterministic); - workspace_size = std::max( - workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); -#endif - } - - if (ddW && dX) { - transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_result.algo = search4::Find( - args4, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_result = - search4::Find(ctx, args4, exhaustive_search, deterministic); - workspace_size = std::max( - workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW( - transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), - DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx, - args1.wdesc.desc(), - w, - args1.cdesc.desc(), - fwd_result1.algo, - &beta, - args1.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args1.idesc.desc(), - ddx + i * group_offset_in, - args1.wdesc.desc(), - w + i * group_offset_filter, - args1.cdesc.desc(), - fwd_result1.algo, - workspace_ptr, - workspace_size, - &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x, - args2.wdesc.desc(), - ddw, - args2.cdesc.desc(), - fwd_result2.algo, - &beta, - args2.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionForward( - handle, - &alpha, - args2.idesc.desc(), - x + i * group_offset_in, - args2.wdesc.desc(), - ddw + i * group_offset_filter, - args2.cdesc.desc(), - fwd_result2.algo, - workspace_ptr, - workspace_size, - &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardWeights( - handle, - &alpha, - args3.odesc.desc(), - transformed_dy_channel, - args3.idesc.desc(), - ddx, - args3.cdesc.desc(), - filter_result.algo, - &beta, - args3.wdesc.desc(), - dw, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardFilter( - handle, - &alpha, - args3.idesc.desc(), - ddx + i * group_offset_in, - args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), - filter_result.algo, - workspace_ptr, - workspace_size, - &beta, - args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args4.odesc.desc(), - transformed_dy_channel, - args4.wdesc.desc(), - ddw, - args4.cdesc.desc(), - data_result.algo, - &beta, - args4.idesc.desc(), - transformed_dx, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnConvolutionBackwardData( - handle, - &alpha, - args4.wdesc.desc(), - ddw + i * group_offset_filter, - args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), - data_result.algo, - workspace_ptr, - workspace_size, - &beta, - args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX_channel, dX); - } - } -} - -template -void DepthwiseConvDoubleGradGPUDNNKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - bool fuse_relu, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -template -void Conv3DCudnnGradGradKernel( - const Context& ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search_t, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - ConvCudnnGradGradKernel(ctx, - input, - filter, - out_grad, - input_grad_grad, - filter_grad_grad, - strides, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search_t, - input_grad, - filter_grad, - out_grad_grad); -} - -} // namespace phi - -#ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - phi::dtype::float16) {} -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) {} - -#else - -PD_REGISTER_KERNEL(conv2d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::ConvCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(conv3d_grad_grad, - GPUDNN, - ALL_LAYOUT, - phi::Conv3DCudnnGradGradKernel, - float, - double, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, - GPU, - ALL_LAYOUT, - phi::DepthwiseConvDoubleGradGPUDNNKernel, - float, - double, - phi::dtype::float16) {} - -#endif - -#endif From e6d5312a480a0aeaf60a22a33141e971f0189fb1 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Oct 2022 09:07:53 +0000 Subject: [PATCH 25/36] reimpl pass attr adapting --- paddle/fluid/framework/op_desc.cc | 21 +-- paddle/fluid/framework/operator.cc | 125 ++++++++++-------- paddle/fluid/operators/ops_extra_info.h | 11 -- .../api/yaml/generator/ops_extra_info_gen.py | 110 +++++++-------- paddle/phi/api/yaml/op_compat.yaml | 1 - paddle/phi/kernels/onednn/conv_handler.h | 1 + 6 files changed, 125 insertions(+), 144 deletions(-) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 7d99b77f487f5..fc3a8bf24dbcf 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -495,17 +495,11 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) attr_type != proto::AttrType::BLOCKS && attr_type != proto::AttrType::VAR && attr_type != proto::AttrType::VARS) { - auto default_attr_iter = runtime_attrs_.find(attr_name); - const auto &extra_dynamic_attr_map = - operators::ExtraInfoUtils::Instance().GetExtraDynamicAttrsMap(Type()); - auto dynamic_attr_iter = extra_dynamic_attr_map.find(attr_name); - - if (default_attr_iter != runtime_attrs_.end()) { - default_attr_iter->second = GetAttrValue(attr); - } else if (dynamic_attr_iter != extra_dynamic_attr_map.end()) { - runtime_attrs_[attr_name] = GetAttrValue(attr); - } else { + auto iter = runtime_attrs_.find(attr_name); + if (iter == runtime_attrs_.end()) { attrs_[attr_name] = GetAttrValue(attr); + } else { + iter->second = GetAttrValue(attr); } } } @@ -671,13 +665,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { const auto &extra_attr_map = operators::ExtraInfoUtils::Instance().GetExtraAttrsMap(Type()); - const auto &extra_dynamic_attr_map = - operators::ExtraInfoUtils::Instance().GetExtraDynamicAttrsMap(Type()); auto extra_attr_iter = extra_attr_map.find(name); - auto extra_dynamic_attr_iter = extra_dynamic_attr_map.find(name); - // auto pass_attr_iter = operators::extra_attr_properties.find(name); - if (extra_attr_iter != extra_attr_map.end() || - extra_dynamic_attr_iter != extra_dynamic_attr_map.end()) { + if (extra_attr_iter != extra_attr_map.end()) { is_runtime_attr = true; attrs_ptr = &(this->runtime_attrs_); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4c2f4c2071188..0796ea8082006 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2708,6 +2708,62 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( return (*arg_map_fn_)(arg_mapping_ctx); } +static void SetDnnAttrIntoDeviceContext( + phi::DeviceContext* dev_ctx, + const Attribute& attr, + const std::string& attr_name, + const operators::ExtraAttrPropertySet& attr_propertys) { +#ifdef PADDLE_WITH_MKLDNN + if (phi::OneDNNContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) { + VLOG(4) << "Runtime attr `" << attr_name << "` is passed to OneDNNContext."; + phi::OneDNNContext* one_dnn_ctx = static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::FLOAT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); + break; + case proto::AttrType::STRING: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::string, attr)); + break; + case proto::AttrType::INTS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::FLOATS: + one_dnn_ctx->SetDnnAttr(attr_name, + PADDLE_GET_CONST(std::vector, attr)); + break; + case proto::AttrType::BOOLEAN: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } + } +#endif +#ifdef PADDLE_WITH_CUDA + if (phi::GPUContext::classof(dev_ctx) && + attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { + VLOG(4) << "Runtime attr `" << attr_name << "` is passed to GPUDNNContext."; + phi::GPUContext* gpu_dnn_ctx = static_cast(dev_ctx); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; + case proto::AttrType::BOOLEAN: + gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); + } + } +#endif +} + void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, @@ -3102,59 +3158,22 @@ void OperatorWithKernel::BuildPhiKernelContext( auto& attr_name = attr_iter.first; auto& attr = attr_iter.second; auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name); -#ifdef PADDLE_WITH_MKLDNN - if (phi::OneDNNContext::classof(dev_ctx) && - attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) { - VLOG(4) << "Runtime attr `" << attr_name - << "` is passed to OneDNNContext."; - phi::OneDNNContext* one_dnn_ctx = - static_cast(dev_ctx); - switch (AttrTypeID(attr)) { - case proto::AttrType::FLOAT: - one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); - break; - case proto::AttrType::STRING: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::string, attr)); - break; - case proto::AttrType::INTS: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::FLOATS: - one_dnn_ctx->SetDnnAttr(attr_name, - PADDLE_GET_CONST(std::vector, attr)); - break; - case proto::AttrType::BOOLEAN: - one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported Attribute value type `%s` for phi.", - platform::demangle(attr.type().name()))); - } - } -#endif -#ifdef PADDLE_WITH_CUDA - if (phi::GPUContext::classof(dev_ctx) && - attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) { - VLOG(4) << "Runtime attr `" << attr_name - << "` is passed to GPUDNNContext."; - phi::GPUContext* gpu_dnn_ctx = static_cast(dev_ctx); - switch (AttrTypeID(attr)) { - case proto::AttrType::INT: - gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); - break; - case proto::AttrType::BOOLEAN: - gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr)); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported Attribute value type `%s` for phi.", - platform::demangle(attr.type().name()))); - } - } -#endif + SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys); + } + // TODO(chenweihang): Since the pass will still `SetAttr` in the OpDesc, + // we try to add these Attrs to the RuntimeAttrs, but these OpDesc will lose + // the RuntimeAttrs information in the process of converting the Graph to + // the Program, so additional record configuration will be introduced, + // which increases the The cost of development and understanding, so we + // still use Attrs to get and the attributes set by these passes from Attrs + // for the time being. In the future, it is necessary to clarify the + // positioning of RuntimeAttrs and expand related functions. + auto& attrs = Attrs(); + for (const auto& attr_iter : attrs) { + auto& attr_name = attr_iter.first; + auto& attr = attr_iter.second; + auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name); + SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys); } VLOG(4) << "Done runtime attributes"; #endif diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 30c4c7c2be476..a54fe36fd201d 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -189,15 +189,6 @@ class ExtraInfoUtils { return empty_extra_attrs_map_; } - const paddle::framework::AttributeMap& GetExtraDynamicAttrsMap( - const std::string& op_type) const { - auto iter = g_extra_dynamic_attrs_map_.find(op_type); - if (iter != g_extra_dynamic_attrs_map_.end()) { - return iter->second; - } - return empty_extra_attrs_map_; - } - const std::vector>& GetExtraAttrsChecker(const std::string& op_type) const { auto iter = g_extra_attrs_checker_.find(op_type); @@ -221,8 +212,6 @@ class ExtraInfoUtils { std::unordered_map g_extra_attrs_map_; - std::unordered_map - g_extra_dynamic_attrs_map_; paddle::framework::AttributeMap empty_extra_attrs_map_{}; std::unordered_map< std::string, diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py index a0983b9d26f27..77596b9f2d46b 100644 --- a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py @@ -17,7 +17,7 @@ import argparse -def map_code_template(default_attrs_str, dynamic_attr_str, attrs_checker_str): +def map_code_template(attrs_str, attrs_checker_str): return f"""// This file is generated by paddle/phi/api/yaml/generator/ops_extra_info_gen.py #include "paddle/fluid/operators/ops_extra_info.h" @@ -28,11 +28,7 @@ def map_code_template(default_attrs_str, dynamic_attr_str, attrs_checker_str): ExtraInfoUtils::ExtraInfoUtils() {{ g_extra_attrs_map_ = {{ - {default_attrs_str} - }}; - - g_extra_dynamic_attrs_map_ = {{ - {dynamic_attr_str} + {attrs_str} }}; g_extra_attrs_checker_ = {{ @@ -72,7 +68,11 @@ def parse_attr(attr_str): ) -def generate_attr_info(attr_type, op_compat_args): +def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): + compat_apis = [] + with open(op_compat_yaml_path, 'rt') as f: + compat_apis = yaml.safe_load(f) + def get_op_name(api_item): names = api_item.split('(') if len(names) == 1: @@ -80,69 +80,53 @@ def get_op_name(api_item): else: return names[1].split(')')[0].strip() - attr_map_str_list = [] - attr_checker_str_list = [] - extra_args_map = op_compat_args['extra'] - if attr_type in extra_args_map: - attr_map_list = [] - attr_checker_func_list = [] - for attr in extra_args_map[attr_type]: - attr_type, attr_name, default_val = parse_attr(attr) - attr_checker_func_list.append( - f"[](framework::AttributeMap* attr_map, bool only_check_exist_value)-> void {{ ExtraAttrChecker<{attr_type}>(\"{attr_name}\", {default_val})(attr_map, only_check_exist_value);}}" - ) - if attr_type.startswith("std::vector"): - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{default_val}}}" - ) - else: - attr_map_list.append( - f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" - ) - api_extra_attr_map = ", ".join(attr_map_list) - api_extra_attr_checkers = ",\n ".join(attr_checker_func_list) - attr_map_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" - ) - attr_checker_str_list.append( - f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" - ) - if 'backward' in op_compat_args: - for bw_item in op_compat_args['backward'].split(','): - bw_op_name = get_op_name(bw_item) - attr_map_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}" - ) - attr_checker_str_list.append( - f"{{\"{bw_op_name}\", {{ {api_extra_attr_checkers} }}}}" - ) - return attr_map_str_list, attr_checker_str_list - - -def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): - compat_apis = [] - with open(op_compat_yaml_path, 'rt') as f: - compat_apis = yaml.safe_load(f) - extra_default_attr_str_list = [] - extra_dynamic_attr_str_list = [] + extra_map_str_list = [] extra_checker_str_list = [] + for op_compat_args in compat_apis: if 'extra' in op_compat_args: + extra_args_map = op_compat_args['extra'] # TODO(chenweihang): add inputs and outputs - default_attr_map_str, default_attr_checker_str = generate_attr_info( - 'attrs', op_compat_args - ) - dynamic_attr_map_str, _ = generate_attr_info( - 'dynamic_attrs', op_compat_args - ) - extra_default_attr_str_list.extend(default_attr_map_str) - extra_dynamic_attr_str_list.extend(dynamic_attr_map_str) - extra_checker_str_list.extend(default_attr_checker_str) + if 'attrs' in extra_args_map: + attr_map_list = [] + attr_checker_func_list = [] + for attr in extra_args_map['attrs']: + attr_type, attr_name, default_val = parse_attr(attr) + attr_checker_func_list.append( + f"[](framework::AttributeMap* attr_map, bool only_check_exist_value)-> void {{ ExtraAttrChecker<{attr_type}>(\"{attr_name}\", {default_val})(attr_map, only_check_exist_value);}}" + ) + if attr_type.startswith("std::vector"): + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{default_val}}}" + ) + else: + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" + ) + api_extra_attr_map = ", ".join(attr_map_list) + api_extra_attr_checkers = ",\n ".join( + attr_checker_func_list + ) + extra_map_str_list.append( + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" + ) + extra_checker_str_list.append( + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" + ) + if 'backward' in op_compat_args: + for bw_item in op_compat_args['backward'].split(','): + bw_op_name = get_op_name(bw_item) + extra_map_str_list.append( + f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}" + ) + extra_checker_str_list.append( + f"{{\"{bw_op_name}\", {{ {api_extra_attr_checkers} }}}}" + ) + ops_extra_info_file = open(ops_extra_info_path, 'w') ops_extra_info_file.write( map_code_template( - ",\n ".join(extra_default_attr_str_list), - ",\n ".join(extra_dynamic_attr_str_list), + ",\n ".join(extra_map_str_list), ",\n ".join(extra_checker_str_list), ) ) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 0afdd49f5aa27..29e8842026276 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -112,7 +112,6 @@ bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - dynamic_attrs : ['float[] Bias_scales = {1.0f}', float Sum_scale = 1.0f, 'float[] Output_shift_scale = {1.0f}', float Activation_scale = 1.0f] - op : conv2d_fusion extra : diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index b3595291e329e..723784a845c2d 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -710,6 +710,7 @@ class ConvOneDNNHandlerT LOG(ERROR) << "Bias should be of type int32 but is " << bias->dtype(); } const K_Bias* bias_data = bias->data(); + return this->AcquireMemoryWithReorder( bias->mem_desc(), this->fwd_pd_->bias_desc(), From 12e18f1e419f0f6e7d0df023f6fc5ab786c4c390 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Oct 2022 12:49:36 +0000 Subject: [PATCH 26/36] add int attr support --- paddle/fluid/framework/operator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0796ea8082006..536ec8dd5e8ca 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2722,6 +2722,9 @@ static void SetDnnAttrIntoDeviceContext( case proto::AttrType::FLOAT: one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr)); break; + case proto::AttrType::INT: + one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr)); + break; case proto::AttrType::STRING: one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::string, attr)); break; From 2ea1a2b175f4c21cbcbeb89e90e8fb8b653983be Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Oct 2022 08:09:23 +0000 Subject: [PATCH 27/36] remove vector inputnames creating --- paddle/fluid/framework/operator.cc | 29 ++++++++++--------------- paddle/fluid/operators/ops_extra_info.h | 10 +++++---- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 536ec8dd5e8ca..c2a044b52e404 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2371,10 +2371,7 @@ Scope* OperatorWithKernel::PrepareData( }; if (run_phi_kernel_) { - std::vector input_names; - for (auto& input_name : kernel_signature_->input_names) { - input_names.emplace_back(input_name); - } + const auto& input_names = kernel_signature_->input_names; const auto& input_defs = phi_kernel_->args_def().input_defs(); PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), @@ -2384,7 +2381,6 @@ Scope* OperatorWithKernel::PrepareData( input_names.size(), input_defs.size())); for (size_t i = 0; i < input_defs.size(); ++i) { - const auto& input_defs = phi_kernel_->args_def().input_defs(); auto& in_def = input_defs.at(i); std::string input_name = input_names[i]; auto iter = ctx->inputs.find(input_name); @@ -2396,19 +2392,18 @@ Scope* OperatorWithKernel::PrepareData( no_buffer_ins && no_buffer_ins->count(input_name) > 0; prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); } - // For input that is Extra - for (auto& var_name_item : Inputs()) { - if (std::find(input_names.begin(), - input_names.end(), - var_name_item.first) == input_names.end()) { - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; - std::vector& input_vars = ctx->inputs[var_name_item.first]; - prepare_input_data( - var_name_item.first, &input_vars, nullptr, should_skip_input); - } +#ifdef PADDLE_WITH_MKLDNN + // For input that is Extra, only MKLDNN will use Extra Inputs + auto& extra_input_names = + paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( + Type()); + for (const auto& input_name : extra_input_names) { + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_name) > 0; + std::vector& input_vars = ctx->inputs[input_name]; + prepare_input_data(input_name, &input_vars, nullptr, should_skip_input); } - +#endif } else { for (auto& var_name_item : Inputs()) { bool should_skip_input = diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index a54fe36fd201d..aca2115fc6c24 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -88,9 +88,7 @@ const std::unordered_map {"use_cudnn", ExtraAttrProperty::SCHEDULE}, {"use_mkldnn", ExtraAttrProperty::SCHEDULE}, // ONEDNN dedicated attributes - {"Activation_scale", ExtraAttrProperty::ONEDNN}, {"Bias", ExtraAttrProperty::ONEDNN}, - {"Bias_scales", ExtraAttrProperty::ONEDNN}, {"data_format", ExtraAttrProperty::ONEDNN}, {"force_fp32_output", ExtraAttrProperty::ONEDNN}, {"fuse_activation", ExtraAttrProperty::ONEDNN}, @@ -112,7 +110,6 @@ const std::unordered_map {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, - {"Output_shift_scale", ExtraAttrProperty::ONEDNN}, {"ResidualData", ExtraAttrProperty::ONEDNN}, {"scale_x", ExtraAttrProperty::ONEDNN}, {"scale_y", ExtraAttrProperty::ONEDNN}, @@ -123,9 +120,13 @@ const std::unordered_map {"Scale_y", ExtraAttrProperty::ONEDNN}, {"Scale_out", ExtraAttrProperty::ONEDNN}, {"Scale_weights", ExtraAttrProperty::ONEDNN}, - {"Sum_scale", ExtraAttrProperty::ONEDNN}, {"x_data_format", ExtraAttrProperty::ONEDNN}, {"y_data_format", ExtraAttrProperty::ONEDNN}, + // ONEDNN pass dedicated attributes + {"Activation_scale", ExtraAttrProperty::ONEDNN}, + {"Bias_scales", ExtraAttrProperty::ONEDNN}, + {"Output_shift_scale", ExtraAttrProperty::ONEDNN}, + {"Sum_scale", ExtraAttrProperty::ONEDNN}, // GPUDNN dedicated attributes {"exhaustive_search", ExtraAttrProperty::GPUDNN}, {"fuse_relu_before_depthwise_conv", ExtraAttrProperty::GPUDNN}, @@ -220,6 +221,7 @@ class ExtraInfoUtils { std::vector> empty_extra_attrs_checker_{}; + // TODO(chenweihang): move these extra inputs into op_compat.yaml std::unordered_map> g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, {"conv2d_grad", {"Bias"}}}; From b80ef68d10b5f19ac99c241d6d2fcb62e62db0cc Mon Sep 17 00:00:00 2001 From: Silv3S Date: Tue, 25 Oct 2022 10:32:49 +0200 Subject: [PATCH 28/36] merge dev --- paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc | 2 +- paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc | 2 +- paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc | 2 +- paddle/phi/kernels/onednn/activation_grad_kernel.cc | 3 +++ paddle/phi/kernels/onednn/activation_kernel.cc | 3 +++ paddle/phi/kernels/onednn/log_softmax_kernel.cc | 3 +++ paddle/phi/kernels/onednn/softmax_grad_kernel.cc | 3 +++ 7 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 3b6aa6dc4dc8d..673f7cd88d6ca 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -23,7 +23,7 @@ #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 464f9fadff95c..83f28885438b7 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -34,7 +34,7 @@ USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL(conv2d, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index f5a34e926778c..8aa299570443b 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -32,7 +32,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc index c8c2b84056c6d..1ebe9f20c63ea 100644 --- a/paddle/phi/kernels/onednn/activation_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc @@ -15,7 +15,10 @@ #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/gelu_grad_kernel.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc index c8d6cb6da6d2d..b9db43fa03b96 100644 --- a/paddle/phi/kernels/onednn/activation_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -15,7 +15,10 @@ #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/gelu_grad_kernel.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/onednn/log_softmax_kernel.cc b/paddle/phi/kernels/onednn/log_softmax_kernel.cc index 8c479b119de89..800d67a31cb5d 100644 --- a/paddle/phi/kernels/onednn/log_softmax_kernel.cc +++ b/paddle/phi/kernels/onednn/log_softmax_kernel.cc @@ -14,7 +14,10 @@ #include "paddle/phi/kernels/log_softmax_kernel.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/onednn/softmax_grad_kernel.cc b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc index 0db255d91948e..facbb9e9f193c 100644 --- a/paddle/phi/kernels/onednn/softmax_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc @@ -14,7 +14,10 @@ #include "paddle/phi/kernels/softmax_grad_kernel.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { From 148eebcf1a263c26136b21075a28ba151b4444cf Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 25 Oct 2022 11:21:42 +0000 Subject: [PATCH 29/36] fix map at error --- paddle/fluid/framework/operator.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c2a044b52e404..bb1f85ae83ae6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2398,9 +2398,13 @@ Scope* OperatorWithKernel::PrepareData( paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap( Type()); for (const auto& input_name : extra_input_names) { + auto iter = ctx->inputs.find(input_name); + if (iter == ctx->inputs.end()) { + continue; + } bool should_skip_input = no_buffer_ins && no_buffer_ins->count(input_name) > 0; - std::vector& input_vars = ctx->inputs[input_name]; + std::vector& input_vars = iter->second; prepare_input_data(input_name, &input_vars, nullptr, should_skip_input); } #endif From 2d4291249c2c121ca6649c931d93542d8041f505 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Tue, 25 Oct 2022 16:53:51 +0200 Subject: [PATCH 30/36] adjust attribute --- paddle/phi/kernels/onednn/softmax_kernel.cc | 53 ++++++++++----------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/paddle/phi/kernels/onednn/softmax_kernel.cc b/paddle/phi/kernels/onednn/softmax_kernel.cc index 818396e71bd46..ec621e7361438 100644 --- a/paddle/phi/kernels/onednn/softmax_kernel.cc +++ b/paddle/phi/kernels/onednn/softmax_kernel.cc @@ -26,36 +26,33 @@ void SoftmaxKernel(const Context& dev_ctx, DenseTensor* out) { funcs::SoftmaxOneDNNHandler handler( dev_ctx.GetEngine(), dev_ctx.GetPlace(), axis, &x, out); - - auto src_memory_p = handler.AcquireSrcMemory(&x); - std::shared_ptr dst_memory_p = nullptr; - if (x.IsSharedBufferWith(*out)) { - dst_memory_p = src_memory_p; - dev_ctx.template Alloc(out); - } else { - dst_memory_p = handler.AcquireDstMemory(out); - } - auto softmax_p = handler.AcquireForwardPrimitive(); - - auto& astream = OneDNNContext::tls().get_stream(); - softmax_p->execute(astream, - {{DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}); - astream.wait(); - - // Find a way to get attribute - // const bool is_test = ctx.Attr("is_test"); - // if (!is_test) { - // T* output_data = out->mutable_data(ctx.GetPlace()); - // std::for_each(output_data, &output_data[out->numel()], [](T& val) { - // val = std::max(val, static_cast(exp(-64))); - // }); - // } - - out->set_mem_desc(dst_memory_p->get_desc()); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + std::shared_ptr dst_memory_p = nullptr; + if (x.IsSharedBufferWith(*out)) { + dst_memory_p = src_memory_p; + dev_ctx.template Alloc(out); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } + auto softmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + softmax_p->execute( + astream, {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + if (dev_ctx.HasDnnAttr("is_test") && !(dev_ctx.GetDnnAttr("is_test"))) { + dev_ctx.template Alloc(output_data); + std::for_each(output_data, &output_data[output->numel()], [](T& val) { + val = std::max(val, static_cast(exp(-64))); + }); } + out->set_mem_desc(dst_memory_p->get_desc()); +} + } // namespace phi PD_REGISTER_KERNEL( - softmax, OneDNN, ALL_LAYOUT, phi::SoftmaxKernel, float, phi::dtype::bfloat16) {} + softmax, OneDNN, ONEDNN, phi::SoftmaxKernel, float, phi::dtype::bfloat16) {} From 9111348ba87e988085b35f00ab8af52cc33f07c2 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Tue, 25 Oct 2022 23:54:41 +0200 Subject: [PATCH 31/36] adapt funcs to PHI --- paddle/phi/kernels/onednn/softmax_kernel.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/onednn/softmax_kernel.cc b/paddle/phi/kernels/onednn/softmax_kernel.cc index ec621e7361438..06709aa0fd158 100644 --- a/paddle/phi/kernels/onednn/softmax_kernel.cc +++ b/paddle/phi/kernels/onednn/softmax_kernel.cc @@ -42,9 +42,12 @@ void SoftmaxKernel(const Context& dev_ctx, astream, {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); astream.wait(); - if (dev_ctx.HasDnnAttr("is_test") && !(dev_ctx.GetDnnAttr("is_test"))) { - dev_ctx.template Alloc(output_data); - std::for_each(output_data, &output_data[output->numel()], [](T& val) { + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + if (!is_test) { + T* out_data = dev_ctx.template Alloc(out); + std::for_each(out_data, &out_data[out->numel()], [](T& val) { val = std::max(val, static_cast(exp(-64))); }); } From 4050733343a253faf4733fff1337211591289e7c Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 27 Oct 2022 10:42:59 +0200 Subject: [PATCH 32/36] init --- .../operators/mkldnn/activation_mkldnn_op.cc | 65 ----------- .../operators/mkldnn/softplus_mkldnn_op.h | 105 ------------------ paddle/phi/kernels/onednn/softplus_kernel.cc | 104 +++++++++++++++++ 3 files changed, 104 insertions(+), 170 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc delete mode 100644 paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h create mode 100644 paddle/phi/kernels/onednn/softplus_kernel.cc diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc deleted file mode 100644 index 0af0b49076795..0000000000000 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace operators { - -using dnnl::memory; -using dnnl::primitive; -using dnnl::stream; -using phi::DataLayout; - -using platform::GetMKLDNNFormat; -using platform::MKLDNNDeviceContext; -using platform::to_void_cast; - -template -class MKLDNNActivationKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - Functor functor; - functor(ctx); - } -}; - -template -struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { - void operator()(const framework::ExecutionContext &ctx) const { - custom_softplus_eltwise_forward(ctx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_FWD_ACTIVATION_MKLDNN_KERNEL(act_type, functor) \ - REGISTER_OP_KERNEL( \ - act_type, \ - MKLDNN, \ - ::paddle::platform::CPUPlace, \ - ops::MKLDNNActivationKernel>, \ - ops::MKLDNNActivationKernel>); - -REGISTER_FWD_ACTIVATION_MKLDNN_KERNEL(softplus, SoftplusMKLDNNFunctor); diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h deleted file mode 100644 index 25886c5791fea..0000000000000 --- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -template -class SoftplusMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - SoftplusMKLDNNHandler(const framework::ExecutionContext& ctx, - const phi::DenseTensor* x, - const float beta, - const dnnl::engine engine) - : platform::MKLDNNHandlerNoCachingT(engine, - ctx.GetPlace()) { - auto x_tz = phi::vectorize(x->dims()); - - auto beta_tz = std::vector(x_tz.size(), 1); - auto beta_md = - dnnl::memory::desc(beta_tz, - platform::MKLDNNGetDataType(), - platform::GetPlainMKLDNNFormat(x_tz.size())); - - dnnl::post_ops post_ops; - post_ops.append_eltwise( - 1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, 0.0f); - if (beta != 1.0f) { - post_ops.append_eltwise( - 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); - } - - platform::AppendActivation(ctx, post_ops); - - dnnl::primitive_attr attrs; - attrs.set_post_ops(post_ops); - - this->AcquireForwardPrimitiveDescriptor(attrs, - dnnl::algorithm::binary_mul, - x->mem_desc(), - beta_md, - x->mem_desc()); - } - - std::shared_ptr AcquireBetaMemory(const float* beta) { - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); - } -}; - -template -void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - bool is_inplaced = x->IsSharedBufferWith(*out); - - const float beta = ctx.Attr("beta"); - - SoftplusMKLDNNHandler handler(ctx, x, beta, mkldnn_engine); - - auto src_memory_p = handler.AcquireSrcMemory(x); - - auto beta_memory_p = handler.AcquireBetaMemory(&beta); - std::shared_ptr dst_memory_p = nullptr; - if (is_inplaced) { - dst_memory_p = src_memory_p; - out->mutable_data(ctx.GetPlace()); - } else { - dst_memory_p = handler.AcquireDstMemory(out); - } - auto binary_p = handler.AcquireForwardPrimitive(); - - auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_memory_p}, - {DNNL_ARG_SRC_1, *beta_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - binary_p->execute(astream, args); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); -} -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/onednn/softplus_kernel.cc b/paddle/phi/kernels/onednn/softplus_kernel.cc new file mode 100644 index 0000000000000..e97e901574cdc --- /dev/null +++ b/paddle/phi/kernels/onednn/softplus_kernel.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +class SoftplusOneDNNHandler : public OneDNNHandlerNoCachingT { + public: + SoftplusOneDNNHandler(const dnnl::engine onednn_engine, + Place cpu_place, + const phi::DenseTensor* x, + const float beta) + : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { + auto x_tz = phi::vectorize(x->dims()); + + auto beta_tz = std::vector(x_tz.size(), 1); + auto beta_md = + dnnl::memory::desc(beta_tz, + platform::OneDNNGetDataType(), + platform::GetPlainOneDNNFormat(x_tz.size())); + + dnnl::post_ops post_ops; + post_ops.append_eltwise( + 1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, 0.0f); + if (beta != 1.0f) { + post_ops.append_eltwise( + 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); + } + + platform::AppendActivation(ctx, post_ops); + + dnnl::primitive_attr attrs; + attrs.set_post_ops(post_ops); + + this->AcquireForwardPrimitiveDescriptor(attrs, + dnnl::algorithm::binary_mul, + x->mem_desc(), + beta_md, + x->mem_desc()); + } + + std::shared_ptr AcquireBetaMemory(const float* beta) { + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); + } +}; + +template +void SoftplusKernel(const Context& dev_ctx, + const DenseTensor& x, + float beta, + float threshold, + DenseTensor* out) { + SoftplusOneDNNHandler handler( + dev_ctx.GetEngine(), dev_ctx.GetPlace(), &x, beta); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto beta_memory_p = handler.AcquireBetaMemory(&beta); + std::shared_ptr dst_memory_p = nullptr; + if (x.IsSharedBufferWith(*out)) { + dst_memory_p = src_memory_p; + dev_ctx.template Alloc(out); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } + auto binary_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_memory_p}, + {DNNL_ARG_SRC_1, *beta_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + binary_p->execute(astream, args); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(softplus, + OneDNN, + ONEDNN, + phi::SoftplusKernel, + float, + phi::dtype::bfloat16) {} From 694f3becae10ac84abf1b767da9fa2057481cf4d Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 27 Oct 2022 11:00:49 +0200 Subject: [PATCH 33/36] adjust imports --- paddle/phi/kernels/onednn/softplus_kernel.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/phi/kernels/onednn/softplus_kernel.cc b/paddle/phi/kernels/onednn/softplus_kernel.cc index e97e901574cdc..d34cd68c66337 100644 --- a/paddle/phi/kernels/onednn/softplus_kernel.cc +++ b/paddle/phi/kernels/onednn/softplus_kernel.cc @@ -20,20 +20,21 @@ namespace phi { template -class SoftplusOneDNNHandler : public OneDNNHandlerNoCachingT { +class SoftplusOneDNNHandler + : public funcs::OneDNNHandlerNoCachingT { public: SoftplusOneDNNHandler(const dnnl::engine onednn_engine, Place cpu_place, const phi::DenseTensor* x, const float beta) - : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { + : funcs::OneDNNHandlerNoCachingT(onednn_engine, + cpu_place) { auto x_tz = phi::vectorize(x->dims()); auto beta_tz = std::vector(x_tz.size(), 1); - auto beta_md = - dnnl::memory::desc(beta_tz, - platform::OneDNNGetDataType(), - platform::GetPlainOneDNNFormat(x_tz.size())); + auto beta_md = dnnl::memory::desc(beta_tz, + funcs::OneDNNGetDataType(), + funcs::GetPlainOneDNNFormat(x_tz.size())); dnnl::post_ops post_ops; post_ops.append_eltwise( @@ -43,7 +44,7 @@ class SoftplusOneDNNHandler : public OneDNNHandlerNoCachingT { 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); } - platform::AppendActivation(ctx, post_ops); + // funcs::AppendActivation(this->dev_ctx_, post_ops); dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); @@ -56,8 +57,8 @@ class SoftplusOneDNNHandler : public OneDNNHandlerNoCachingT { } std::shared_ptr AcquireBetaMemory(const float* beta) { - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + funcs::to_void_cast(beta)); } }; From bbd33ebcbc3550fee53ed3f2a310f4f994400879 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 27 Oct 2022 11:18:18 +0200 Subject: [PATCH 34/36] support postops --- paddle/phi/kernels/onednn/softplus_kernel.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/onednn/softplus_kernel.cc b/paddle/phi/kernels/onednn/softplus_kernel.cc index d34cd68c66337..9775e6362f2ab 100644 --- a/paddle/phi/kernels/onednn/softplus_kernel.cc +++ b/paddle/phi/kernels/onednn/softplus_kernel.cc @@ -23,12 +23,11 @@ template class SoftplusOneDNNHandler : public funcs::OneDNNHandlerNoCachingT { public: - SoftplusOneDNNHandler(const dnnl::engine onednn_engine, - Place cpu_place, + SoftplusOneDNNHandler(const OneDNNContext& dev_ctx, const phi::DenseTensor* x, const float beta) - : funcs::OneDNNHandlerNoCachingT(onednn_engine, - cpu_place) { + : funcs::OneDNNHandlerNoCachingT(dev_ctx.GetEngine(), + dev_ctx.GetPlace()) { auto x_tz = phi::vectorize(x->dims()); auto beta_tz = std::vector(x_tz.size(), 1); @@ -44,7 +43,7 @@ class SoftplusOneDNNHandler 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); } - // funcs::AppendActivation(this->dev_ctx_, post_ops); + funcs::AppendActivation(dev_ctx, post_ops); dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); @@ -68,8 +67,7 @@ void SoftplusKernel(const Context& dev_ctx, float beta, float threshold, DenseTensor* out) { - SoftplusOneDNNHandler handler( - dev_ctx.GetEngine(), dev_ctx.GetPlace(), &x, beta); + SoftplusOneDNNHandler handler(dev_ctx, &x, beta); auto src_memory_p = handler.AcquireSrcMemory(&x); auto beta_memory_p = handler.AcquireBetaMemory(&beta); From d4e7b66fe9aa1c0e8bbf5aa712cbe936c7601cf9 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 27 Oct 2022 11:30:01 +0200 Subject: [PATCH 35/36] format codeblocks --- paddle/phi/kernels/onednn/softplus_kernel.cc | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/paddle/phi/kernels/onednn/softplus_kernel.cc b/paddle/phi/kernels/onednn/softplus_kernel.cc index 9775e6362f2ab..b87938e3dc11b 100644 --- a/paddle/phi/kernels/onednn/softplus_kernel.cc +++ b/paddle/phi/kernels/onednn/softplus_kernel.cc @@ -28,13 +28,6 @@ class SoftplusOneDNNHandler const float beta) : funcs::OneDNNHandlerNoCachingT(dev_ctx.GetEngine(), dev_ctx.GetPlace()) { - auto x_tz = phi::vectorize(x->dims()); - - auto beta_tz = std::vector(x_tz.size(), 1); - auto beta_md = dnnl::memory::desc(beta_tz, - funcs::OneDNNGetDataType(), - funcs::GetPlainOneDNNFormat(x_tz.size())); - dnnl::post_ops post_ops; post_ops.append_eltwise( 1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, 0.0f); @@ -42,12 +35,16 @@ class SoftplusOneDNNHandler post_ops.append_eltwise( 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); } - funcs::AppendActivation(dev_ctx, post_ops); - dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); + auto x_tz = phi::vectorize(x->dims()); + auto beta_tz = std::vector(x_tz.size(), 1); + auto beta_md = dnnl::memory::desc(beta_tz, + funcs::OneDNNGetDataType(), + funcs::GetPlainOneDNNFormat(x_tz.size())); + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_mul, x->mem_desc(), From 83ba50b6d3a9cd4d5218996080500d13d2f1a7aa Mon Sep 17 00:00:00 2001 From: Silv3S Date: Wed, 2 Nov 2022 09:12:24 +0100 Subject: [PATCH 36/36] revert changes to softmax --- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 2 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 111 ++++++++++++++++++ .../operators/mkldnn/test_mkldnn_caching.cc | 2 +- .../mkldnn/test_mkldnn_op_inplace.cc | 2 +- paddle/phi/backends/onednn/onednn_reuse.h | 9 +- paddle/phi/kernels/onednn/softmax_kernel.cc | 61 ---------- 6 files changed, 115 insertions(+), 72 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc delete mode 100644 paddle/phi/kernels/onednn/softmax_kernel.cc diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 673f7cd88d6ca..9b0cb0a60902f 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -23,7 +23,7 @@ #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); +USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc new file mode 100644 index 0000000000000..57935a1a1c1aa --- /dev/null +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace paddle { +namespace operators { + +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNMemDesc; + +using dnnl::memory; // Note: paddle has also "memory" namespace +using dnnl::primitive; +using dnnl::prop_kind; +using dnnl::softmax_backward; +using dnnl::softmax_forward; +using dnnl::stream; +using platform::to_void_cast; + +template +class SoftmaxMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + SoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + platform::Place cpu_place, + const phi::DenseTensor* input, + phi::DenseTensor* output, + const int axis) + : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, + cpu_place) { + PADDLE_ENFORCE_EQ( + input->dims(), + output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + this->AcquireForwardPrimitiveDescriptor( + prop_kind::forward_scoring, input->mem_desc(), axis); + } +}; + +template +class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const phi::DenseTensor* input = ctx.Input("X"); + phi::DenseTensor* output = ctx.Output("Out"); + bool is_inplaced = input->IsSharedBufferWith(*output); + + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), input->dims().size()); + + SoftmaxMKLDNNHandler handler( + mkldnn_engine, ctx.GetPlace(), input, output, axis); + + auto softmax_src_memory_p = handler.AcquireSrcMemory(input); + // For Inplace src and and dst are the same memory object + std::shared_ptr softmax_dst_memory_p = nullptr; + if (is_inplaced) { + softmax_dst_memory_p = softmax_src_memory_p; + output->mutable_data(ctx.GetPlace()); + } else { + softmax_dst_memory_p = handler.AcquireDstMemory(output); + } + auto softmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + softmax_p->execute(astream, + {{DNNL_ARG_SRC, *softmax_src_memory_p}, + {DNNL_ARG_DST, *softmax_dst_memory_p}}); + astream.wait(); + + const bool is_test = ctx.Attr("is_test"); + if (!is_test) { + T* output_data = output->mutable_data(ctx.GetPlace()); + std::for_each(output_data, &output_data[output->numel()], [](T& val) { + val = std::max(val, static_cast(exp(-64))); + }); + } + + output->set_mem_desc(softmax_dst_memory_p->get_desc()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(softmax, + MKLDNN, + ::paddle::platform::CPUPlace, + ops::SoftmaxMKLDNNKernel, + ops::SoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 63b7cfd51fb55..9a4237230e86f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -34,7 +34,7 @@ USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); +USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(conv2d); PD_DECLARE_KERNEL(conv2d, OneDNN, ONEDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 8aa299570443b..62b57d286b86a 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -32,7 +32,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); -PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); +USE_OP_DEVICE_KERNEL(softmax, MKLDNN); PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 536ac86496d3f..4a28c4262f32d 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -753,19 +753,12 @@ class SoftmaxOneDNNHandler public: SoftmaxOneDNNHandler(const dnnl::engine onednn_engine, Place cpu_place, - int axis, const DenseTensor* x, - DenseTensor* out) + int axis) : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->dims(), - out->dims(), - phi::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - const int canonical_axis = funcs::CanonicalAxis(axis, x->dims().size()); this->AcquireForwardPrimitiveDescriptor( dnnl::prop_kind::forward_scoring, x->mem_desc(), canonical_axis); diff --git a/paddle/phi/kernels/onednn/softmax_kernel.cc b/paddle/phi/kernels/onednn/softmax_kernel.cc deleted file mode 100644 index 06709aa0fd158..0000000000000 --- a/paddle/phi/kernels/onednn/softmax_kernel.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/softmax_kernel.h" - -#include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void SoftmaxKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { - funcs::SoftmaxOneDNNHandler handler( - dev_ctx.GetEngine(), dev_ctx.GetPlace(), axis, &x, out); - - auto src_memory_p = handler.AcquireSrcMemory(&x); - std::shared_ptr dst_memory_p = nullptr; - if (x.IsSharedBufferWith(*out)) { - dst_memory_p = src_memory_p; - dev_ctx.template Alloc(out); - } else { - dst_memory_p = handler.AcquireDstMemory(out); - } - auto softmax_p = handler.AcquireForwardPrimitive(); - - auto& astream = OneDNNContext::tls().get_stream(); - softmax_p->execute( - astream, {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); - astream.wait(); - - bool is_test = dev_ctx.HasDnnAttr("is_test") - ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) - : false; - if (!is_test) { - T* out_data = dev_ctx.template Alloc(out); - std::for_each(out_data, &out_data[out->numel()], [](T& val) { - val = std::max(val, static_cast(exp(-64))); - }); - } - - out->set_mem_desc(dst_memory_p->get_desc()); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - softmax, OneDNN, ONEDNN, phi::SoftmaxKernel, float, phi::dtype::bfloat16) {}