From 98270c18778ec6ad59c162d9cac975f24aeacc92 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Thu, 5 Aug 2021 16:03:19 +0200
Subject: [PATCH] - modified UT

---
 .../mkldnn/elementwise_mkldnn_op.h            |   4 +-
 .../mkldnn/elementwise_mul_mkldnn_op.cc       |   8 +-
 .../operators/mkldnn/activation_mkldnn_op.cc  |  11 +-
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc |   6 +-
 .../operators/mkldnn/softmax_mkldnn_op.cc     |  50 +--
 .../operators/mkldnn/test_mkldnn_caching.cc   |  66 ++--
 paddle/fluid/platform/mkldnn_reuse.h          | 316 +++++++++---------
 7 files changed, 220 insertions(+), 241 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index 46201d5e5c9a1..77b573183835b 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -47,7 +47,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
     float scale_o = ctx.Attr<float>("Scale_out");
     int axis = ctx.Attr<int>("axis");
 
-    platform::BinaryMKLDNNHandler<T> handler( BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, scale_y, scale_o);
+    platform::BinaryMKLDNNHandler<T> handler(BINARY_OP, axis, mkldnn_engine,
+                                             ctx.GetPlace(), x, y, z, scale_x,
+                                             scale_y, scale_o);
 
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index 508867ed37434..af4aab8047888 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -48,8 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     if (dx) {
       // dx = dout*y
       platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, mkldnn_engine,
-          ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f);
+          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, y, dx, 1.0f, 1.0f, 1.0f);
 
       const auto src_dout_memory = handler.AcquireSrcMemory(dout);
       const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
@@ -74,8 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
       // Handler is having nullptr passed instead of output tensor as
       // we want Dst buffer to be allocated by oneDNN not to use Tensor
       platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, mkldnn_engine,
-          ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f);
+          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, x, nullptr, 1.0f, 1.0f, 1.0f);
 
       const auto src_dout_memory = handler.AcquireSrcMemory(dout);
       const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
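For reference, the dnnl::binary primitive that BinaryMKLDNNHandler wraps can be driven directly. A minimal standalone sketch against the plain oneDNN 2.x API follows; the engine, shapes, and variable names are illustrative, not part of the patch:

// Minimal dnnl::binary example: z = x * y on CPU (oneDNN 2.x API).
#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  const dnnl::memory::dims dims = {2, 3};
  auto md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::nc);

  std::vector<float> x(6, 2.f), y(6, 3.f), z(6, 0.f);
  dnnl::memory x_mem(md, eng, x.data());
  dnnl::memory y_mem(md, eng, y.data());
  dnnl::memory z_mem(md, eng, z.data());

  // Descriptor -> primitive descriptor -> primitive: the same chain the
  // Paddle handler hides behind AcquireForwardPrimitiveDescriptor().
  auto desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, md, md, md);
  auto pd = dnnl::binary::primitive_desc(desc, eng);

  dnnl::binary(pd).execute(strm, {{DNNL_ARG_SRC_0, x_mem},
                                  {DNNL_ARG_SRC_1, y_mem},
                                  {DNNL_ARG_DST, z_mem}});
  strm.wait();
  return 0;  // z now holds 6.f in every element
}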
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 2351e759f2d8e..d992890adeec3 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -79,14 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
                     paddle::platform::errors::PreconditionNotMet(
                         "Operator DNNL eletwise_forward must use CPUPlace"));
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-  const auto& mkldnn_engine = dev_ctx.GetEngine();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Output<Tensor>("Out");
 
   bool is_inplaced = x->IsSharedBufferWith(*y);
 
-  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x);
+  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
+                                               ctx.GetPlace(), x);
 
   auto src_memory_p = handler.AcquireSrcMemory(x);
   auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
@@ -105,14 +106,14 @@ template <typename T>
 void eltwise_grad(const framework::ExecutionContext &ctx,
                   mkldnn::algorithm algorithm) {
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-  const auto& mkldnn_engine = dev_ctx.GetEngine();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   const auto *x = ctx.Input<Tensor>("X");
   const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
   auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-  platform::ActivationMKLDNNHandler<T> handler(
-      algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x, diff_y);
+  platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
+                                               ctx.GetPlace(), x, diff_y);
 
   auto src_memory_p = handler.AcquireBackwardSrcMemory(x);
   auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y);
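The is_inplaced branch above hands the destination the very same memory object as the source; in bare oneDNN terms that is just an in-place eltwise execution. A hedged standalone sketch (oneDNN 2.x API, illustrative values):

// In-place ReLU: DST aliases SRC, mirroring the is_inplaced branch above.
#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  auto md = dnnl::memory::desc({4, 8}, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::nc);
  std::vector<float> data(32, -1.f);
  dnnl::memory buf(md, eng, data.data());

  auto desc = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training,
                                          dnnl::algorithm::eltwise_relu, md,
                                          0.f /*alpha*/, 0.f /*beta*/);
  auto pd = dnnl::eltwise_forward::primitive_desc(desc, eng);

  // Same memory object passed for SRC and DST == in-place execution.
  dnnl::eltwise_forward(pd).execute(
      strm, {{DNNL_ARG_SRC, buf}, {DNNL_ARG_DST, buf}});
  strm.wait();
  return 0;  // data is now all zeros
}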
diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
index 2022c1ab910b4..84ac14d04b85b 100644
--- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
@@ -37,10 +37,12 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
     bool is_inplaced = x->IsSharedBufferWith(*out);
 
     platform::ActivationMKLDNNHandler<T> handler(
-        mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), x);
+        mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
+        x);
 
     auto src_memory_p = handler.AcquireSrcMemory(x);
-    auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+    auto dst_memory_p =
+        is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
     auto activation_p = handler.AcquireForwardPrimitive();
 
     auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index fc647e184cb81..8be2e1d6227f4 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -33,12 +33,13 @@ using platform::to_void_cast;
 
 template <typename T>
 class SoftmaxMKLDNNHandler
-    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward, mkldnn::softmax_backward> {
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                               mkldnn::softmax_backward> {
  public:
   SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine,
                        platform::Place cpu_place, const Tensor* input,
                        Tensor* output, const int axis)
-      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward, mkldnn::softmax_backward>(mkldnn_engine, cpu_place) {
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                          mkldnn::softmax_backward>(
+            mkldnn_engine, cpu_place) {
     PADDLE_ENFORCE_EQ(
         input->dims(), output->dims(),
@@ -49,7 +50,8 @@ class SoftmaxMKLDNNHandler
     auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
                            input->format());
 
-    this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, axis);
+    this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
+                                            axis);
   }
 
   SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx,
@@ -58,25 +60,26 @@ class SoftmaxMKLDNNHandler
                        const mkldnn::engine mkldnn_engine,
                        platform::Place cpu_place, const Tensor* out,
                        const Tensor* out_grad, Tensor* in_x_grad,
                        const std::string& unique_name)
-      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward, mkldnn::softmax_backward>(mkldnn_engine, cpu_place) {
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
+                                          mkldnn::softmax_backward>(
+            mkldnn_engine, cpu_place) {
     PADDLE_ENFORCE_EQ(
         out_grad->dims(), in_x_grad->dims(),
         platform::errors::InvalidArgument("The shape of softmax_grad's input "
                                           "and output must be identical."));
 
     auto dims = out_grad->dims();  // input and output share the same shape
     const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
     auto softmax_tz = framework::vectorize(dims);
 
     auto data_softmax_md = MKLDNNMemDesc(
         softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
     auto diff_softmax_md = MKLDNNMemDesc(
         softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
 
     this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
                                             data_softmax_md, axis);
     this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
                                              axis);
   }
 };
@@ -93,7 +96,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     const int axis = CanonicalAxis(ctx.Attr<int>("axis"), input->dims().size());
 
-    SoftmaxMKLDNNHandler<T> handler(mkldnn_engine, ctx.GetPlace(), input, output, axis);
+    SoftmaxMKLDNNHandler<T> handler(mkldnn_engine, ctx.GetPlace(), input,
+                                    output, axis);
 
     auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
     // For Inplace src and and dst are the same memory object
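CanonicalAxis, used by both softmax constructors above, resolves a possibly negative axis attribute against the tensor rank. An illustrative standalone reimplementation of its contract (the real helper lives elsewhere in Paddle; this sketch is not its code):

// Negative axes count from the end: axis=-1 on a rank-4 tensor becomes 3.
#include <cassert>

static int CanonicalAxisSketch(int axis, int rank) {
  assert(axis >= -rank && axis < rank);
  return axis < 0 ? axis + rank : axis;
}

int main() {
  assert(CanonicalAxisSketch(-1, 4) == 3);
  assert(CanonicalAxisSketch(1, 4) == 1);
  return 0;
}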
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
index cad4f47ec1402..04337c2596531 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
@@ -70,11 +70,16 @@ void RunOperator(const platform::Place &place, const std::string &op_type,
   std::map<std::string, int> num_inputs = {{"softmax", 1},
                                            {"relu", 1},
+                                           {"conv2d", 2},
                                            {"elementwise_add", 2},
                                            {"elementwise_mul", 2}};
 
   std::string first_input = inplace == true ? output_name : "x";
+  std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X";
+  std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y";
+  std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out";
+
   std::vector<InputVars> input_names = {
       {first_input, scope.Var(first_input)->GetMutable<framework::LoDTensor>()},
       {"x1", num_inputs[op_type] > 1
@@ -113,68 +118,37 @@ void RunOperator(const platform::Place &place, const std::string &op_type,
   auto &pool = platform::DeviceContextPool::Instance();
 
-  auto op = num_inputs[op_type] > 1
-                ? framework::OpRegistry::CreateOp(
-                      op_type, {{"X", {first_input}}, {"Y", {"x1"}}},
-                      {{"Out", {output_name}}}, {{"use_mkldnn", {true}}})
-                : framework::OpRegistry::CreateOp(
-                      op_type, {{"X", {first_input}}}, {{"Out", {output_name}}},
-                      {{"use_mkldnn", {true}}});
+  auto op =
+      num_inputs[op_type] > 1
+          ? framework::OpRegistry::CreateOp(
+                op_type, {{first_input_var_name, {first_input}},
+                          {second_input_var_name, {"x1"}}},
+                {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}})
+          : framework::OpRegistry::CreateOp(
+                op_type, {{first_input_var_name, {first_input}}},
+                {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}});
 
   op->Run(scope, place);
   pool.Get(place)->Wait();
 }
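The conv2d branch matters because conv2d names its operator slots Input/Filter/Output rather than the generic X/Y/Out; the three *_var_name strings select between the two conventions before CreateOp is called. A standalone sketch of just that selection logic (the SlotsFor helper is hypothetical, for illustration only):

// Slot-name selection mirroring the *_var_name logic in the test above.
#include <cassert>
#include <string>

struct Slots {
  std::string in0, in1, out;
};

static Slots SlotsFor(const std::string& op_type) {
  if (op_type == "conv2d") return {"Input", "Filter", "Output"};
  return {"X", "Y", "Out"};  // softmax, relu, elementwise_*
}

int main() {
  assert(SlotsFor("conv2d").in1 == "Filter");
  assert(SlotsFor("elementwise_add").out == "Out");
  return 0;
}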
"conv2d", dims, "conv_out"); + RunOperator(p, "conv2d", dims, "conv_out2"); PADDLE_ENFORCE_EQ(ct.Analyze(8), true, platform::errors::InvalidArgument( "Wrong number of cached oneDNN objects")); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 3668105ccfd54..ab93d6b7cdf31 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,40 +34,36 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; - template class MKLDNNHandlerNoCachingT { public: MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) - : engine_(engine), - place_(cpu_place), - fwd_pd_(nullptr), - bwd_pd_(nullptr) { + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { platform::MKLDNNDeviceContext::tls().log_lib_version(); } std::shared_ptr AcquireForwardPrimitive() { - return std::make_shared(*fwd_pd_); + return std::make_shared(*fwd_pd_); } std::shared_ptr AcquireBackwardPrimitive() { - return std::make_shared(*bwd_pd_); + return std::make_shared(*bwd_pd_); } std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_PD should be set when " - "getting BWD prim .")); - return std::make_shared(*bwd_w_pd_); + PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( + "Error: BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); } std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - fwd_pd_->src_desc(), to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); } template @@ -93,8 +89,8 @@ class MKLDNNHandlerNoCachingT { std::shared_ptr AcquireDiffDstMemory( const framework::Tensor* diffdst) { const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive( - bwd_pd_->diff_dst_desc(), to_void_cast(ptr)); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); } std::shared_ptr AcquireDiffSrcMemory( @@ -113,7 +109,8 @@ class MKLDNNHandlerNoCachingT { "Error: BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); } // Buffer is allocated by oneDNN to store computation results @@ -126,14 +123,13 @@ class MKLDNNHandlerNoCachingT { } protected: - // If your primitive descriptor requires attributes, pass them as a // first argument and paramters to descriptor constructor in the following // arguments. Otherwise, all arguments will be forwarded to descriptor // constructor, including the first one. template void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); } // Using sfinae to specialise variadic function. Workaround for not having @@ -161,9 +157,9 @@ class MKLDNNHandlerNoCachingT { void AcquireBackwardPrimitiveDescriptor(Args&&... 
   void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
     // fwd_pd_ is set during grad by calling
     // AcquireForwardPrimitiveDescriptor
-    PADDLE_ENFORCE_NOT_NULL(
-        fwd_pd_,
-        platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed."));
+    PADDLE_ENFORCE_NOT_NULL(fwd_pd_,
+                            platform::errors::Unavailable(
+                                "Get MKLDNN Forward primitive %s failed."));
     auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
     bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
         bwd_desc, engine_, *fwd_pd_);
   }
 
@@ -173,29 +169,29 @@ class MKLDNNHandlerNoCachingT {
   template <typename... Args>
   void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) {
     // fwd_pd_ is set during grad by calling
     // AcquireForwardPrimitiveDescriptor
-    PADDLE_ENFORCE_NOT_NULL(
-        fwd_pd_,
-        platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed."));
+    PADDLE_ENFORCE_NOT_NULL(fwd_pd_,
+                            platform::errors::Unavailable(
+                                "Get MKLDNN Forward primitive %s failed."));
     auto bwd_desc =
         typename TBackward_params::desc(std::forward<Args>(args)...);
     bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
         bwd_desc, engine_, *fwd_pd_);
   }
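Note the ordering contract encoded here: the backward descriptor takes fwd_pd_ as a construction hint, so AcquireForwardPrimitiveDescriptor must always run first. The same dependency in bare oneDNN 2.x (illustrative sketch, not Paddle code):

// Backward eltwise pd requires the forward pd as a hint (oneDNN 2.x).
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  auto md = dnnl::memory::desc({8, 16}, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::nc);

  // Forward pd first -- the analogue of AcquireForwardPrimitiveDescriptor.
  auto fwd_d = dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training,
                                           dnnl::algorithm::eltwise_relu, md,
                                           0.f, 0.f);
  auto fwd_pd = dnnl::eltwise_forward::primitive_desc(fwd_d, eng);

  // Backward pd is built against the forward pd as hint -- the analogue of
  // AcquireBackwardPrimitiveDescriptor.
  auto bwd_d = dnnl::eltwise_backward::desc(dnnl::algorithm::eltwise_relu,
                                            md /*diff_data_desc*/,
                                            md /*data_desc*/, 0.f, 0.f);
  auto bwd_pd = dnnl::eltwise_backward::primitive_desc(bwd_d, eng, fwd_pd);
  (void)bwd_pd;
  return 0;
}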
 
   std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
       mkldnn::memory::desc md, void* ptr) {
     return std::make_shared<mkldnn::memory>(md, engine_, ptr);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
       mkldnn::memory::desc md) {
     return std::make_shared<mkldnn::memory>(md, engine_);
   }
 
   void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
                       const std::shared_ptr<mkldnn::memory>& target_memory_p) {
     auto reorder_p =
         std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
@@ -212,33 +208,30 @@ class MKLDNNHandlerNoCachingT {
       const mkldnn::memory::desc& target_md, void* ptr,
       const std::string& suffix, bool is_persistent = false,
       std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
-
     std::shared_ptr<mkldnn::memory> target_memory_p;
     if (custom_reorder_func) {
       auto reordered_data =
           custom_reorder_func(reinterpret_cast<const F*>(ptr));
       ptr = reinterpret_cast<void*>(reordered_data.get());
     }
-    auto user_memory_p =
-        std::make_shared<dnnl::memory>(user_md, engine_, ptr);
+    auto user_memory_p = std::make_shared<dnnl::memory>(user_md, engine_, ptr);
     if (user_md != target_md) {
       target_memory_p = std::make_shared<mkldnn::memory>(target_md, engine_);
       auto reorder_p =
           std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
 
       auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
       platform::RecordEvent record_reorder("int_reorder",
                                            platform::EventRole::kUniqueOp);
       reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
                                    {MKLDNN_ARG_TO, *target_memory_p}});
       astream.wait();
     } else {
       target_memory_p = user_memory_p;
     }
     return target_memory_p;
   }
 
   mkldnn::engine engine_;
   platform::Place place_;
   std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
   std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
   std::shared_ptr<typename TBackward_params::primitive_desc> bwd_w_pd_;
 };
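AcquireMemoryWithReorder only spends a reorder when the user and target descriptors differ. The mechanism underneath, stripped of Paddle types, caching, and profiling, is a plain dnnl::reorder (oneDNN 2.x, illustrative sketch):

// Reorder data between two layouts (ab -> ba) -- the bare version of what
// AcquireMemoryWithReorder does when user_md != target_md.
#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  auto user_md = dnnl::memory::desc({2, 3}, dnnl::memory::data_type::f32,
                                    dnnl::memory::format_tag::ab);
  auto target_md = dnnl::memory::desc({2, 3}, dnnl::memory::data_type::f32,
                                      dnnl::memory::format_tag::ba);

  std::vector<float> src = {0, 1, 2, 3, 4, 5}, dst(6, 0.f);
  dnnl::memory user_mem(user_md, eng, src.data());
  dnnl::memory target_mem(target_md, eng, dst.data());

  if (user_md != target_md) {  // same check the handler performs
    dnnl::reorder(user_mem, target_mem)
        .execute(strm, {{DNNL_ARG_FROM, user_mem}, {DNNL_ARG_TO, target_mem}});
    strm.wait();
  }
  return 0;  // dst now holds src's values laid out column-major
}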
@@ -801,63 +794,64 @@ class MKLDNNHandler {
 };
 
 template <typename T>
-class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
+class BinaryMKLDNNHandler
+    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
  public:
   BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis,
                       const mkldnn::engine engine, platform::Place cpu_place,
                       const Tensor* x, const Tensor* y, Tensor* z,
                       float scale_x, float scale_y, float scale_z)
       : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
     PADDLE_ENFORCE_EQ(
         x->layout(), DataLayout::kMKLDNN,
         platform::errors::InvalidArgument("Wrong layout set for X tensor."));
     PADDLE_ENFORCE_NE(
         x->format(), MKLDNNMemoryFormat::undef,
         platform::errors::InvalidArgument("Wrong format set for X tensor."));
 
     PADDLE_ENFORCE_EQ(
         y->layout(), DataLayout::kMKLDNN,
         platform::errors::InvalidArgument("Wrong layout set for Y tensor."));
     PADDLE_ENFORCE_NE(
         y->format(), MKLDNNMemoryFormat::undef,
         platform::errors::InvalidArgument("Wrong format set for Y tensor."));
 
     const auto src_x_tz = framework::vectorize(x->dims());
     const auto src_y_tz = framework::vectorize(y->dims());
     // if output tensor(z) is nullptr then we are computing into oneDNN
     // managed buffer
     auto rankdiff = x->dims().size() - y->dims().size();
     const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
                                        : framework::vectorize(z->dims());
 
     auto src0_md = dnnl::memory::desc(
         src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
     auto src1_md = dnnl::memory::desc(
         src_y_tz, platform::MKLDNNGetDataType<T>(), y->format());
     if (rankdiff > 0) {  // Second input is of smaller rank than first
       std::vector<int64_t> dims1_ex(rankdiff, 1);
       dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
                       src_y_tz.begin(), src_y_tz.end());
       src1_md = src1_md.reshape(dims1_ex);
     } else if (rankdiff < 0) {  // First input is of smaller than second
       std::vector<int64_t> dims0_ex(-rankdiff, 1);
       dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)),
                       src_x_tz.begin(), src_x_tz.end());
       src0_md = src0_md.reshape(dims0_ex);
     }
     const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
                                      MKLDNNMemoryFormat::any);
 
     auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
-    this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md,
-                                            src1_md, dst_md);
+    this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
+                                            dst_md);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        this->fwd_pd_->src1_desc(), to_void_cast<T>(input_data));
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(),
+                                            to_void_cast<T>(input_data));
   }
 
  private:
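The rankdiff branches pad the lower-rank operand with 1s so oneDNN can broadcast it, with axis deciding where the real dims land. An illustrative reimplementation of just that dims arithmetic (the ExpandDims name is hypothetical, not the patch's code):

// Expand the lower-rank operand's shape with 1s for broadcasting, mirroring
// the dims1_ex logic above: e.g. x:{2,3,4,5}, y:{3,4}, axis=1 -> {1,3,4,1}.
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

static std::vector<int64_t> ExpandDims(const std::vector<int64_t>& small,
                                       int rankdiff, int axis) {
  std::vector<int64_t> ex(rankdiff, 1);  // start with all-1 padding
  ex.insert(std::next(ex.begin(), axis == -1 ? rankdiff : axis),
            small.begin(), small.end());
  return ex;
}

int main() {
  auto d = ExpandDims({3, 4}, 2, 1);
  assert((d == std::vector<int64_t>{1, 3, 4, 1}));
  auto e = ExpandDims({5}, 3, -1);  // axis=-1 appends after the padding
  assert((e == std::vector<int64_t>{1, 1, 1, 5}));
  return 0;
}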
ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); } + } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = - src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), - src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = + mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, md, alpha, beta); } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, @@ -1033,40 +1027,42 @@ class ActivationMKLDNNHandler const mkldnn::engine engine, Place cpu_place, const framework::Tensor* in_x, const Tensor* out_grad) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + mkldnn::eltwise_backward>(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? 
@@ -1033,40 +1027,42 @@ class ActivationMKLDNNHandler
   ActivationMKLDNNHandler(mkldnn::algorithm algorithm,
                           const framework::ExecutionContext& ctx,
                           const mkldnn::engine engine, Place cpu_place,
                           const framework::Tensor* in_x, const Tensor* out_grad)
-      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
-                                          mkldnn::eltwise_backward>(engine, cpu_place) {
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                          mkldnn::eltwise_backward>(engine,
+                                                                    cpu_place) {
     float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
     float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
 
     // paddle uses beta but mkldnn uses alpha for swish
     if (algorithm == mkldnn::algorithm::eltwise_swish) {
       std::swap(alpha, beta);
     } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
       alpha = ctx.Attr<float>("threshold");
     }
 
     auto diff_dst_tz = framework::vectorize(out_grad->dims());
 
     auto src_fmt =
         diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format();
     auto diff_fmt =
         diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format();
 
     auto dims = framework::vectorize(in_x->dims());
     auto diff_dst_md = platform::MKLDNNMemDesc(
         dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
     auto src_md = platform::MKLDNNMemDesc(
         dims, platform::MKLDNNGetDataType<T>(), src_fmt);
 
-    this->AcquireForwardPrimitiveDescriptor(
-        mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta);
+    this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
+                                            algorithm, src_md, alpha, beta);
     this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md,
                                              alpha, beta);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBackwardSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), to_void_cast<T>(input_data));
+    return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(),
+                                            to_void_cast<T>(input_data));
   }
 };
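To close, the whole backward flow this constructor prepares, written against bare oneDNN 2.x (illustrative sketch: forward pd as hint, backward primitive consuming SRC and DIFF_DST and writing DIFF_SRC):

// End-to-end ReLU gradient via oneDNN -- the flow ActivationMKLDNNHandler's
// backward constructor sets up: fwd pd (hint) -> bwd pd -> execute.
#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);
  auto md = dnnl::memory::desc({2, 4}, dnnl::memory::data_type::f32,
                               dnnl::memory::format_tag::nc);

  auto fwd_pd = dnnl::eltwise_forward::primitive_desc(
      dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training,
                                  dnnl::algorithm::eltwise_relu, md, 0.f, 0.f),
      eng);
  auto bwd_pd = dnnl::eltwise_backward::primitive_desc(
      dnnl::eltwise_backward::desc(dnnl::algorithm::eltwise_relu, md, md, 0.f,
                                   0.f),
      eng, fwd_pd);

  std::vector<float> x = {-1, 2, -3, 4, -5, 6, -7, 8};
  std::vector<float> dy(8, 1.f), dx(8, 0.f);
  dnnl::memory src(md, eng, x.data());
  dnnl::memory diff_dst(md, eng, dy.data());
  dnnl::memory diff_src(md, eng, dx.data());

  // ReLU grad needs the forward input (SRC) in addition to DIFF_DST.
  dnnl::eltwise_backward(bwd_pd).execute(strm,
                                         {{DNNL_ARG_SRC, src},
                                          {DNNL_ARG_DIFF_DST, diff_dst},
                                          {DNNL_ARG_DIFF_SRC, diff_src}});
  strm.wait();
  return 0;  // dx == {0,1,0,1,0,1,0,1}
}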