From 9e4533dbb94c74e560964b7a4098cc8c8e46538d Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Tue, 13 Jul 2021 15:20:31 +0200 Subject: [PATCH 1/7] test version of matmul_v2 --- paddle/fluid/operators/matmul_v2_op.cc | 17 +- .../operators/mkldnn/matmul_mkldnn_op.cc | 690 ------------------ .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 145 +++- .../mkldnn/test_matmul_v2_mkldnn_op.py | 326 ++++----- 4 files changed, 282 insertions(+), 896 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 8ac81596a36d3..c8f5924165637 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -168,11 +168,18 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto out_grad_name = framework::GradVarName("Out"); - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, out_grad_name), - ctx.GetPlace()); + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc deleted file mode 100644 index 2b3496359b0c6..0000000000000 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ /dev/null @@ -1,690 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -struct CPUPlace; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -using dnnl::memory; -using dnnl::primitive; -using framework::DataLayout; -using framework::ExecutionContext; -using platform::GetMKLDNNFormat; -using platform::MKLDNNDeviceContext; -using platform::MKLDNNGetDataType; -using platform::to_void_cast; -using Tensor = framework::Tensor; - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static framework::Tensor FoldOuterDims(const Tensor& input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static framework::Tensor FoldFirstAndLastDims( - const MKLDNNDeviceContext& dev_ctx, const Tensor* input) { - auto input_dims = framework::vectorize(input->dims()); - if (input_dims.size() != 3) { - return *input; - } - - framework::Tensor output; - output.Resize({input_dims[1], input_dims[0], input_dims[2]}); - - auto output_dims = framework::vectorize(output.dims()); - - memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); - std::string key = platform::CreateKey(dev_ctx, input_dims, input->format(), - input->format(), input_type); - platform::ReorderMKLDNNHandler reorder_handler(output_dims, input->type(), - input_type, dev_ctx, - dev_ctx.GetEngine(), key); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - memory::format_tag::abc, platform::to_void_cast(input->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - &output, memory::format_tag::bac, dev_ctx.GetPlace()); - auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, - reorder_dst_memory_p); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); - return output; -} - -template -class MatMulMKLDNNHandler : public platform::MKLDNNHandlerT { - public: - MatMulMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine engine, platform::Place cpu_place, - Tensor* x, bool trans_x, Tensor* y, bool trans_y, - Tensor* out, float scale, const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name)) { - if (!this->isCached()) { - auto mat_dim_x = math::CreateMatrixDescriptor(x->dims(), 0, trans_x); - auto mat_dim_y = math::CreateMatrixDescriptor(y->dims(), 0, trans_y); - - memory::dim x_bs = mat_dim_x.batch_size_; - memory::dim y_bs = mat_dim_y.batch_size_; - - memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; - const memory::dim M = mat_dim_x.height_; - const memory::dim N = mat_dim_y.width_; - const memory::dim K = mat_dim_x.width_; - - memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; - memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; - memory::dims out_dims = {out_bs, M, N}; - - memory::dims x_strides = - !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; - - memory::dims y_strides = - !trans_y ? memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; - memory::dims out_strides = memory::dims{M * N, N, 1}; - - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); - - dnnl::primitive_attr attrs; - if (scale != 1.0f) attrs.set_output_scales(0, {scale}); - - this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); - } - } - - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data), - "@weights_mem_p"); - } -}; - -template -constexpr bool IsInt8() { - return std::is_same::value || std::is_same::value; -} - -template -constexpr bool IsBfloat16() { - return std::is_same::value; -} - -// Get row matrix shape from a vector shape. If the rank of x_dim > 1, the -// original x_dim is returned. -static framework::DDim RowMatrixDimsFromVector(const framework::DDim& x_dim) { - return x_dim.size() > 1 ? x_dim : framework::make_ddim({1, x_dim[0]}); -} - -// Get column matrix shape from a vector shape. If the ran of y_dim > 1, the -// original y_dim is returned. -static framework::DDim ColumnMatrixDimsFromVector( - const framework::DDim& y_dim) { - return y_dim.size() > 1 ? y_dim : framework::make_ddim({y_dim[0], 1}); -} - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorToMatrixSequence( - framework::Tensor* x, const math::MatDescriptor& descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. - */ -static void ReshapeXYOutToMatrixSequence(framework::Tensor* x, - framework::Tensor* y, - framework::Tensor* out, bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixDimsFromVector(x->dims()); - auto y_dim = ColumnMatrixDimsFromVector(y->dims()); - auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, mat_dim_y.width_}); - } - - ReshapeTensorToMatrixSequence(x, mat_dim_x); - ReshapeTensorToMatrixSequence(y, mat_dim_y); -} - -template -class MatMulFactory { - public: - void CreateAndExecute(const ExecutionContext& ctx) { - SetDNNLEngine(ctx); - if (IsInitialized()) { - UpdateDataPointers(ctx); - Execute(); - SetOutputFormat(ctx); - return; - } - CreateMemories(ctx); - CreatePrimitive(ctx); - Execute(); - SetOutputFormat(ctx); - SetInitialized(); - } - - private: - struct MatMulDims { - const memory::dims x_dims, y_dims, out_dims, x_strides, y_strides, - out_strides; - }; - - void SetDNNLEngine(const ExecutionContext& ctx) { - auto& dev_ctx = - ctx.template device_context(); - engine_ = dev_ctx.GetEngine(); - } - - template - dnnl::memory CreateMemory(const memory::dims& dims, - const memory::dims& strides, const T* data) { - auto md = memory::desc(dims, MKLDNNGetDataType(), strides); - return dnnl::memory(md, engine_, to_void_cast(data)); - } - - std::vector Transpose(const std::vector& x, - const std::vector& axis) { - size_t in_rank = x.size(); - size_t axis_size = axis.size(); - - auto axis_set = std::set(axis.begin(), axis.end()); - PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, - platform::errors::InvalidArgument( - "In an axis array, elements must be unique.")); - - PADDLE_ENFORCE_EQ( - in_rank, axis_size, - platform::errors::InvalidArgument("The input dimension's size " - "should be equal to the axis's size. " - "But received dimension is %d, " - "axis's size is %d", - in_rank, axis_size)); - - PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, - platform::errors::InvalidArgument( - "Axis values must be ranging from 0 to (dims - 1).")); - - std::vector new_x(x.size()); - for (size_t i = 0; i < x.size(); i++) { - new_x[i] = x[axis[i]]; - } - return new_x; - } - - std::pair GetInputDimsAndStrides( - const ExecutionContext& ctx, std::string input_name) { - auto shape = ctx.Attr>("fused_reshape_" + input_name); - auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); - auto new_dims = input_dims; - if (!shape.empty() && !axis.empty()) { - new_dims = input_dims.reshape(shape).transpose(axis); - } - - auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector - : ColumnMatrixDimsFromVector; - math::MatDescriptor mat_dim = - math::CreateMatrixDescriptor(MatrixDimsFromVector(new_dims), 0, - ctx.Attr("transpose_" + input_name)); - - memory::dims strides; - if (!shape.empty()) { - auto shape2 = input_dims.reshape(shape); - strides.push_back(1); - for (auto i = shape2.size() - 1; i > 0; --i) { - strides.insert(strides.begin(), strides.front() * shape2[i]); - } - strides = Transpose(strides, axis); - if (shape.size() == 4) - strides.erase(strides.begin()); - else if (shape.size() == 2) - strides.insert(strides.begin(), shape[0] * shape[1]); - mat_dim.stride_ = strides[0]; - if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); - } - return std::make_pair(mat_dim, strides); - } - - bool IsInputFused(const ExecutionContext& ctx) const { - return !(ctx.Attr>("fused_reshape_X").empty() && - ctx.Attr>("fused_reshape_Y").empty()); - } - - bool IsOutputFused(const ExecutionContext& ctx) const { - auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); - auto& fused_transpose_Out = - ctx.Attr>("fused_transpose_Out"); - return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); - } - - void CorrectStridesWhenFloatOutputFused(const ExecutionContext& ctx, - const memory::dim N, memory::dim b, - memory::dims* out_strides) const { - if (!IsInt8() && !IsBfloat16() && IsOutputFused(ctx)) { - *out_strides = {N, b * N, 1}; - } - } - - MatMulDims GetMatmulDims(const ExecutionContext& ctx) { - math::MatDescriptor mat_dim_x; - memory::dims strides_x; - std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X"); - math::MatDescriptor mat_dim_y; - memory::dims strides_y; - std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y"); - - auto x_bs = mat_dim_x.batch_size_; - auto y_bs = mat_dim_y.batch_size_; - PADDLE_ENFORCE_EQ(x_bs > 0 && y_bs > 0 && x_bs != y_bs, false, - platform::errors::InvalidArgument( - "If batch sizes of X and Y are positive," - "they have to be equal.")); - - memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; - const memory::dim M = mat_dim_x.height_; - const memory::dim N = mat_dim_y.width_; - const memory::dim K = mat_dim_x.width_; - - batch_size_ = 1; - if (out_bs > 1 && (IsOutputFused(ctx) || IsInputFused(ctx))) { - auto& x_dims = ctx.Input("X")->dims(); - auto& y_dims = ctx.Input("Y")->dims(); - batch_size_ = x_bs > y_bs ? x_dims[0] : y_dims[0]; - x_bs /= batch_size_; - y_bs /= batch_size_; - out_bs /= batch_size_; - } - memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; - memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; - memory::dims out_dims = {out_bs, M, N}; - - x_offset_ = x_bs * M * K * sizeof(XT); - y_offset_ = y_bs * K * N * sizeof(YT); - out_offset_ = out_bs * M * N * sizeof(OT); - - // Translate transA and transB - if (strides_x.empty()) - strides_x = !ctx.Attr("transpose_X") ? memory::dims{M * K, K, 1} - : memory::dims{M * K, 1, M}; - if (strides_y.empty()) - strides_y = !ctx.Attr("transpose_Y") ? memory::dims{N * K, N, 1} - : memory::dims{N * K, 1, K}; - memory::dims out_strides = memory::dims{M * N, N, 1}; - - CorrectStridesWhenFloatOutputFused(ctx, N, out_bs, &out_strides); - - return {x_dims, y_dims, out_dims, strides_x, strides_y, out_strides}; - } - - void CreateMemories(const ExecutionContext& ctx) { - auto matmul_dims = GetMatmulDims(ctx); - - x_mem_ = CreateMemory(matmul_dims.x_dims, matmul_dims.x_strides, - ctx.Input("X")->data()); - y_mem_ = CreateMemory(matmul_dims.y_dims, matmul_dims.y_strides, - ctx.Input("Y")->data()); - out_mem_ = CreateMemory( - matmul_dims.out_dims, matmul_dims.out_strides, - ctx.Output("Out")->mutable_data(ctx.GetPlace())); - } - - float ComputeOutputScale(const ExecutionContext& ctx) { - float scale_x = ctx.Attr("Scale_x"); - float scale_y = ctx.Attr("Scale_y"); - bool force_fp32_out = ctx.Attr("force_fp32_output"); - float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); - float alpha = ctx.Attr("alpha"); - return alpha * scale_out / (scale_x * scale_y); - } - - void CreatePrimitive(const ExecutionContext& ctx) { - dnnl::primitive_attr attr; - float scale_out = ComputeOutputScale(ctx); - if (scale_out != 1.0f) { - constexpr unsigned tensor_wide_scale = 0; - attr.set_output_scales(tensor_wide_scale, {scale_out}); - } - - auto matmul_d = dnnl::matmul::desc(x_mem_.get_desc(), y_mem_.get_desc(), - out_mem_.get_desc()); - auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, attr, engine_); - matmul_prim_ = dnnl::matmul(matmul_pd); - } - - void Execute() { - dnnl::stream stream(engine_); - - void* x_ptr = x_mem_.get_data_handle(); - void* y_ptr = y_mem_.get_data_handle(); - void* out_ptr = out_mem_.get_data_handle(); - for (uint16_t i = 0; i < batch_size_; i++) { - x_mem_.set_data_handle(x_ptr); - y_mem_.set_data_handle(y_ptr); - out_mem_.set_data_handle(out_ptr); - matmul_prim_.execute(stream, { - {MKLDNN_ARG_SRC, x_mem_}, - {MKLDNN_ARG_WEIGHTS, y_mem_}, - {MKLDNN_ARG_DST, out_mem_}, - }); - x_ptr = static_cast(x_ptr) + x_offset_; - y_ptr = static_cast(y_ptr) + y_offset_; - out_ptr = static_cast(out_ptr) + out_offset_; - } - stream.wait(); - } - - void SetOutputFormat(const ExecutionContext& ctx) { - using platform::MKLDNNFormatForSize; - auto* out = ctx.Output("Out"); - auto format = - MKLDNNFormatForSize(out->dims().size(), MKLDNNMemoryFormat::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); - } - - void UpdateDataPointers(const ExecutionContext& ctx) { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - x_mem_.set_data_handle(to_void_cast(x->data())); - y_mem_.set_data_handle(to_void_cast(y->data())); - out_mem_.set_data_handle(out->mutable_data(ctx.GetPlace())); - } - - // If initialized, x memory should've been already initialized - bool IsInitialized() { return initialized_; } - - void SetInitialized() { initialized_ = true; } - - private: - struct memory_offsets { - size_t x_offset; - size_t y_offset; - size_t out_offset; - }; - - dnnl::engine engine_; - dnnl::memory x_mem_; - dnnl::memory y_mem_; - dnnl::memory out_mem_; - dnnl::matmul matmul_prim_; - uint32_t x_offset_; - uint32_t y_offset_; - uint32_t out_offset_; - uint16_t batch_size_; - bool initialized_ = false; -}; - -template -static std::shared_ptr> GetPrimitiveFactory( - const ExecutionContext& ctx) { - const auto& out_name = ctx.OutputName("Out"); - const auto& dev_ctx = ctx.template device_context(); - const auto batch_size = ctx.Input("X")->dims()[0]; - std::string key = platform::CreateKey(dev_ctx, batch_size, out_name); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - auto factory = - std::static_pointer_cast>(dev_ctx.GetBlob(key)); - if (factory == nullptr) { - factory = std::make_shared>(); - dev_ctx.SetBlob(key, factory); - } - - return factory; -} - -// Choose appropriate primitive factory implementation based on inferred -// output type (uint8, int8 or float). -template -static void ExecuteMatMul(const ExecutionContext& ctx) { - constexpr bool is_int8 = IsInt8(); - constexpr bool is_bfloat16 = IsBfloat16(); - const bool force_fp32_output = ctx.Attr("force_fp32_output"); - constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses - if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); - } else if (is_bfloat16) { - GetPrimitiveFactory(ctx) - ->CreateAndExecute(ctx); - } else if (fuse_relu) { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); - } else { - GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); - } -} - -template -class DNNLMatMulKernel : public framework::OpKernel { - public: - void Compute(const ExecutionContext& ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), 1, - platform::errors::Unimplemented( - "DNNL matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - platform::MKLDNNDeviceContext::tls().log_lib_version(); - ExecuteMatMul(ctx); - } -}; - -template -class MatMulGradMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const ExecutionContext& ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), 1, - platform::errors::Unimplemented( - "DNNL matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - RunKernel(ctx); - } - - private: - void ExecuteMatMulGrad(const ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine& engine, Tensor* x, bool trans_x, - bool is_fold_init_dims_x, Tensor* y, bool trans_y, - bool is_fold_init_dims_y, Tensor* out, - int execution_number) const { - // gradient is calculated in a different way when broadcasting is used - bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && - out->dims().size() == 2; - - Tensor x_combined, y_combined; - if (!need_combine) { - x_combined = *x; - y_combined = *y; - } else { - x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) - : FoldFirstAndLastDims(dev_ctx, x); - y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) - : FoldFirstAndLastDims(dev_ctx, y); - } - - MatMulMKLDNNHandler handler( - dev_ctx, engine, ctx.GetPlace(), &x_combined, trans_x, &y_combined, - trans_y, out, ctx.Attr("alpha"), - ctx.InputName(framework::GradVarName("Out")) + - std::to_string(execution_number)); - - const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); - const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( - framework::vectorize(out->dims())))); - } - - template - void RunKernel(const ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - - ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, true, true, - &dout, true, false, dx, 0); - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, - &x, true, false, dy, 1); - } else if (transpose_x) { - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, false, false, - &dout, true, false, dx, 0); - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, false, false, - &dout, false, true, dy, 1); - } else if (transpose_y) { - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, - &y, false, true, dx, 0); - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, - &x, false, true, dy, 1); - } else { - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, - &y, true, false, dx, 0); - this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, true, true, - &dout, false, true, dy, 1); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, - ops::DNNLMatMulKernel, - ops::DNNLMatMulKernel, - ops::DNNLMatMulKernel, - ops::DNNLMatMulKernel); - -REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::MatMulGradMKLDNNKernel, - ops::MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 50afd417170e0..a0e8ad5110dcd 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h" namespace paddle { namespace operators { @@ -37,7 +34,7 @@ class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT { const mkldnn::engine engine, platform::Place cpu_place, std::vector& x_dims, bool trans_x, std::vector& y_dims, bool trans_y, - const std::string& uniq_name) + const std::string& uniq_name, float scale = 1.0f) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, platform::CreateKey(dev_ctx, x_dims, uniq_name)) { @@ -104,7 +101,61 @@ class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT { }; template -class MatMulV2MKLDNNKernel : public framework::OpKernel { +class MatMulV2MKLDNNKernel : public MatMulGradMKLDNNKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } + + private: + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + auto x_dims = framework::vectorize(x->dims()); + auto y_dims = framework::vectorize(y->dims()); + auto out_dims = framework::vectorize(out->dims()); + + int ndims = std::max(x->dims().size(), y->dims().size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + this->CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, + out); + + MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), + x_bd_dims, trans_x, y_bd_dims, trans_y, + ctx.InputName("X")); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format( + GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims))); + } +}; + +template +class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { public: void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } @@ -151,13 +202,32 @@ class MatMulV2MKLDNNKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool trans_x = ctx.Attr("trans_x"); - bool trans_y = ctx.Attr("trans_y"); auto x_dims = framework::vectorize(x->dims()); auto y_dims = framework::vectorize(y->dims()); - auto out_dims = framework::vectorize(out->dims()); + + bool is_broadcast = true; + if (x_dims.size() <= 2 || y_dims.size() <= 2) { + is_broadcast = false; + } else if (x_dims.size() != y_dims.size()) { + is_broadcast = true; + } else { + is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_dims.size() - 2, + y_dims.cbegin()); + } + + if(!is_broadcast){ + MatMulGradMKLDNNKernel::Compute(ctx); + return; + } + + auto* dout=ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + auto dout_dims = framework::vectorize(dout->dims()); int ndims = std::max(x->dims().size(), y->dims().size()); ndims = std::max(ndims, 3); @@ -165,33 +235,32 @@ class MatMulV2MKLDNNKernel : public framework::OpKernel { std::vector x_bd_dims(ndims, 1); std::vector y_bd_dims(ndims, 1); - CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, - out); - - MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), - x_bd_dims, trans_x, y_bd_dims, trans_y, - ctx.InputName("X")); - - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format( - GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims))); + //CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, dout_dims, + // dout); + + //MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), + // x_bd_dims, trans_x, y_bd_dims, trans_y, + // ctx.InputName("X")); + //const auto src_memory_p = handler.AcquireSrcMemory(x); + //const auto weights_memory_p = handler.AcquireWeightsMemory(y); + //const auto dst_memory_p = handler.AcquireDstMemory(dout); + //auto matmul_p = handler.AcquireForwardPrimitive(); + + //std::unordered_map matmul_args = { + // {DNNL_ARG_SRC, *src_memory_p}, + // {DNNL_ARG_WEIGHTS, *weights_memory_p}, + // {DNNL_ARG_DST, *dst_memory_p}}; + + //auto& astream = MKLDNNDeviceContext::tls().get_stream(); + //matmul_p->execute(astream, matmul_args); + //astream.wait(); + + //dout->set_layout(framework::DataLayout::kMKLDNN); + //dout->set_format( + // GetMKLDNNFormat(dst_memory_p->get_desc().reshape(dout_dims))); } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; @@ -200,6 +269,6 @@ REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::MatMulV2MKLDNNKernel, ops::MatMulV2MKLDNNKernel); -// REGISTER_OP_KERNEL(matmul_grad_v2, MKLDNN, ::paddle::platform::CPUPlace, -// ops::MatMulV2GradMKLDNNKernel, -// ops::MatMulV2GradMKLDNNKernel); +REGISTER_OP_KERNEL(matmul_v2_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulV2GradMKLDNNKernel, + ops::MatMulV2GradMKLDNNKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 11b111310d3b9..68865c8ebc855 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -51,8 +51,8 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): class TestMatMulV2VectorXVectorOneDNNOp(OpTest): def config(self): - self.x_shape = (100, ) - self.y_shape = (100, ) + self.x_shape = (2, 3, 10, 5) + self.y_shape = (2, 3, 5, 10) self.trans_x = False self.trans_y = False @@ -89,150 +89,150 @@ def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') -class TestMatMulV2VectorXMatrixTransposeYOneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (100, ) - self.y_shape = (1, 3, 2, 100) - self.trans_x = False - self.trans_y = True - - -class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (100, ) - self.y_shape = (1, 1, 100, 2) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXVectorTransposeXOneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 1, 100, 1) - self.y_shape = (100, ) - self.trans_x = True - self.trans_y = False - - -class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 2, 1, 100) - self.y_shape = (100, ) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 1, 2, 100) - self.y_shape = (1, 1, 100, 1) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 1, 1, 100) - self.y_shape = (2, 1, 2, 100) - self.trans_x = False - self.trans_y = True - - -class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 1, 12, 4) - self.y_shape = (1, 2, 4, 12) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (2, 1, 2, 100) - self.y_shape = (1, 1, 100, 2) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (2, 1, 4, 25) - self.y_shape = (1, 1, 4, 25) - self.trans_x = True - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (2, 2, 5, 4) - self.y_shape = (2, 2, 5, 3) - self.trans_x = True - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (3, 1, 6, 5) - self.y_shape = (1, 2, 6, 9) - self.trans_x = True - self.trans_y = False - - -class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (3, 1, 6, 6) - self.y_shape = (1, 2, 6, 9) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (100) - self.y_shape = (1, 2, 2, 100, 2) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (2, 1, 40) - self.y_shape = (40) - self.trans_x = False - self.trans_y = False - - -class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (3, 1, 10, 8) - self.y_shape = (1, 2, 9, 10) - self.trans_x = True - self.trans_y = True - - -class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (3, 1, 10, 10) - self.y_shape = (1, 2, 9, 10) - self.trans_x = False - self.trans_y = True - - -class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp( - TestMatMulV2VectorXVectorOneDNNOp): - def config(self): - self.x_shape = (1, 3, 1, 10, 10) - self.y_shape = (3, 1, 2, 9, 10) - self.trans_x = False - self.trans_y = True - +#class TestMatMulV2VectorXMatrixTransposeYOneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (100, ) +# self.y_shape = (1, 3, 2, 100) +# self.trans_x = False +# self.trans_y = True +# +# +#class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (100, ) +# self.y_shape = (1, 1, 100, 2) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXVectorTransposeXOneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 1, 100, 1) +# self.y_shape = (100, ) +# self.trans_x = True +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 2, 1, 100) +# self.y_shape = (100, ) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 1, 2, 100) +# self.y_shape = (1, 1, 100, 1) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 1, 1, 100) +# self.y_shape = (2, 1, 2, 100) +# self.trans_x = False +# self.trans_y = True +# +# +#class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 1, 12, 4) +# self.y_shape = (1, 2, 4, 12) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (2, 1, 2, 100) +# self.y_shape = (1, 1, 100, 2) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (2, 1, 4, 25) +# self.y_shape = (1, 1, 4, 25) +# self.trans_x = True +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (2, 2, 5, 4) +# self.y_shape = (2, 2, 5, 3) +# self.trans_x = True +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (3, 1, 6, 5) +# self.y_shape = (1, 2, 6, 9) +# self.trans_x = True +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (3, 1, 6, 6) +# self.y_shape = (1, 2, 6, 9) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (100) +# self.y_shape = (1, 2, 2, 100, 2) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (2, 1, 40) +# self.y_shape = (40) +# self.trans_x = False +# self.trans_y = False +# +# +#class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (3, 1, 10, 8) +# self.y_shape = (1, 2, 9, 10) +# self.trans_x = True +# self.trans_y = True +# +# +#class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (3, 1, 10, 10) +# self.y_shape = (1, 2, 9, 10) +# self.trans_x = False +# self.trans_y = True +# +# +#class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp( +# TestMatMulV2VectorXVectorOneDNNOp): +# def config(self): +# self.x_shape = (1, 3, 1, 10, 10) +# self.y_shape = (3, 1, 2, 9, 10) +# self.trans_x = False +# self.trans_y = True +# # BF16 TESTS def create_bf16_test_class(parent): @@ -265,23 +265,23 @@ def test_check_grad(self): globals()[cls_name] = TestMatMulV2Bf16OneDNNOp -create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp) -create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp) -create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp) -create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) -create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) +#create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp) +#create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp) +#create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp) +#create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) +#create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) if __name__ == "__main__": paddle.enable_static() From 11b2b6b59a1fe6832915bef8c209737dca1dc171 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 12:27:44 +0200 Subject: [PATCH 2/7] added matmul_v2 grad kernel --- paddle/fluid/operators/matmul_v2_op.cc | 15 +- .../fluid/operators/mkldnn/matmul_mkldnn_op.h | 698 ++++++++++++++++++ .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 238 ++++-- .../mkldnn/test_matmul_v2_mkldnn_op.py | 437 ++++++----- 4 files changed, 1140 insertions(+), 248 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index c8f5924165637..b75ef49b6876d 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -62,10 +62,15 @@ class MatMulV2Op : public framework::OperatorWithKernel { } std::vector new_dims; - if (ndims_x >= ndims_y) { + if (ndims_x > ndims_y) { new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else { + } else if (ndims_x < ndims_y) { new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (int i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); + } } if (!x_broadcasted) { new_dims.push_back(M); @@ -168,9 +173,9 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, framework::GradVarName("Out")); + const framework::ExecutionContext& ctx) const override { + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h new file mode 100644 index 0000000000000..9e90fe805d27e --- /dev/null +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h @@ -0,0 +1,698 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace platform { +class MKLDNNDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { + +using dnnl::memory; +using dnnl::primitive; +using framework::DataLayout; +using framework::ExecutionContext; +using platform::GetMKLDNNFormat; +using platform::MKLDNNDeviceContext; +using platform::MKLDNNGetDataType; +using platform::to_void_cast; +using Tensor = framework::Tensor; + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +static framework::Tensor FoldOuterDims(const Tensor& input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static framework::Tensor FoldFirstAndLastDims( + const MKLDNNDeviceContext& dev_ctx, const Tensor* input) { + auto input_dims = framework::vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + framework::Tensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = framework::vectorize(output.dims()); + + memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); + std::string key = platform::CreateKey(dev_ctx, input_dims, input->format(), + input->format(), input_type); + platform::ReorderMKLDNNHandler reorder_handler(output_dims, input->type(), + input_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, platform::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +template +class MatMulMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + MatMulMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + Tensor* x, bool trans_x, Tensor* y, bool trans_y, + Tensor* out, float scale, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + auto mat_dim_x = math::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; + + memory::dims y_strides = + !trans_y ? memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@weights_mem_p"); + } +}; + +template +constexpr bool IsInt8() { + return std::is_same::value || std::is_same::value; +} + +template +constexpr bool IsBfloat16() { + return std::is_same::value; +} + +// Get row matrix shape from a vector shape. If the rank of x_dim > 1, the +// original x_dim is returned. +static framework::DDim RowMatrixDimsFromVector(const framework::DDim& x_dim) { + return x_dim.size() > 1 ? x_dim : framework::make_ddim({1, x_dim[0]}); +} + +// Get column matrix shape from a vector shape. If the ran of y_dim > 1, the +// original y_dim is returned. +static framework::DDim ColumnMatrixDimsFromVector( + const framework::DDim& y_dim) { + return y_dim.size() > 1 ? y_dim : framework::make_ddim({y_dim[0], 1}); +} + +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + framework::Tensor* x, const math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutToMatrixSequence(framework::Tensor* x, + framework::Tensor* y, + framework::Tensor* out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixDimsFromVector(x->dims()); + auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + +template +class MatMulFactory { + public: + void CreateAndExecute(const ExecutionContext& ctx) { + SetDNNLEngine(ctx); + if (IsInitialized()) { + UpdateDataPointers(ctx); + Execute(); + SetOutputFormat(ctx); + return; + } + CreateMemories(ctx); + CreatePrimitive(ctx); + Execute(); + SetOutputFormat(ctx); + SetInitialized(); + } + + private: + struct MatMulDims { + const memory::dims x_dims, y_dims, out_dims, x_strides, y_strides, + out_strides; + }; + + void SetDNNLEngine(const ExecutionContext& ctx) { + auto& dev_ctx = + ctx.template device_context(); + engine_ = dev_ctx.GetEngine(); + } + + template + dnnl::memory CreateMemory(const memory::dims& dims, + const memory::dims& strides, const T* data) { + auto md = memory::desc(dims, MKLDNNGetDataType(), strides); + return dnnl::memory(md, engine_, to_void_cast(data)); + } + + std::vector Transpose(const std::vector& x, + const std::vector& axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); + + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, + platform::errors::InvalidArgument( + "In an axis array, elements must be unique.")); + + PADDLE_ENFORCE_EQ( + in_rank, axis_size, + platform::errors::InvalidArgument("The input dimension's size " + "should be equal to the axis's size. " + "But received dimension is %d, " + "axis's size is %d", + in_rank, axis_size)); + + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, + platform::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); + + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; + } + + std::pair GetInputDimsAndStrides( + const ExecutionContext& ctx, std::string input_name) { + auto shape = ctx.Attr>("fused_reshape_" + input_name); + auto axis = ctx.Attr>("fused_transpose_" + input_name); + auto input_dims = ctx.Input(input_name)->dims(); + auto new_dims = input_dims; + if (!shape.empty() && !axis.empty()) { + new_dims = input_dims.reshape(shape).transpose(axis); + } + + auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector + : ColumnMatrixDimsFromVector; + math::MatDescriptor mat_dim = + math::CreateMatrixDescriptor(MatrixDimsFromVector(new_dims), 0, + ctx.Attr("transpose_" + input_name)); + + memory::dims strides; + if (!shape.empty()) { + auto shape2 = input_dims.reshape(shape); + strides.push_back(1); + for (auto i = shape2.size() - 1; i > 0; --i) { + strides.insert(strides.begin(), strides.front() * shape2[i]); + } + strides = Transpose(strides, axis); + if (shape.size() == 4) + strides.erase(strides.begin()); + else if (shape.size() == 2) + strides.insert(strides.begin(), shape[0] * shape[1]); + mat_dim.stride_ = strides[0]; + if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); + } + return std::make_pair(mat_dim, strides); + } + + bool IsInputFused(const ExecutionContext& ctx) const { + return !(ctx.Attr>("fused_reshape_X").empty() && + ctx.Attr>("fused_reshape_Y").empty()); + } + + bool IsOutputFused(const ExecutionContext& ctx) const { + auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto& fused_transpose_Out = + ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); + } + + void CorrectStridesWhenFloatOutputFused(const ExecutionContext& ctx, + const memory::dim N, memory::dim b, + memory::dims* out_strides) const { + if (!IsInt8() && !IsBfloat16() && IsOutputFused(ctx)) { + *out_strides = {N, b * N, 1}; + } + } + + MatMulDims GetMatmulDims(const ExecutionContext& ctx) { + math::MatDescriptor mat_dim_x; + memory::dims strides_x; + std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X"); + math::MatDescriptor mat_dim_y; + memory::dims strides_y; + std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y"); + + auto x_bs = mat_dim_x.batch_size_; + auto y_bs = mat_dim_y.batch_size_; + PADDLE_ENFORCE_EQ(x_bs > 0 && y_bs > 0 && x_bs != y_bs, false, + platform::errors::InvalidArgument( + "If batch sizes of X and Y are positive," + "they have to be equal.")); + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + batch_size_ = 1; + if (out_bs > 1 && (IsOutputFused(ctx) || IsInputFused(ctx))) { + auto& x_dims = ctx.Input("X")->dims(); + auto& y_dims = ctx.Input("Y")->dims(); + batch_size_ = x_bs > y_bs ? x_dims[0] : y_dims[0]; + x_bs /= batch_size_; + y_bs /= batch_size_; + out_bs /= batch_size_; + } + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + x_offset_ = x_bs * M * K * sizeof(XT); + y_offset_ = y_bs * K * N * sizeof(YT); + out_offset_ = out_bs * M * N * sizeof(OT); + + // Translate transA and transB + if (strides_x.empty()) + strides_x = !ctx.Attr("transpose_X") ? memory::dims{M * K, K, 1} + : memory::dims{M * K, 1, M}; + if (strides_y.empty()) + strides_y = !ctx.Attr("transpose_Y") ? memory::dims{N * K, N, 1} + : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + CorrectStridesWhenFloatOutputFused(ctx, N, out_bs, &out_strides); + + return {x_dims, y_dims, out_dims, strides_x, strides_y, out_strides}; + } + + void CreateMemories(const ExecutionContext& ctx) { + auto matmul_dims = GetMatmulDims(ctx); + + x_mem_ = CreateMemory(matmul_dims.x_dims, matmul_dims.x_strides, + ctx.Input("X")->data()); + y_mem_ = CreateMemory(matmul_dims.y_dims, matmul_dims.y_strides, + ctx.Input("Y")->data()); + out_mem_ = CreateMemory( + matmul_dims.out_dims, matmul_dims.out_strides, + ctx.Output("Out")->mutable_data(ctx.GetPlace())); + } + + float ComputeOutputScale(const ExecutionContext& ctx) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.Attr("force_fp32_output"); + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + float alpha = ctx.Attr("alpha"); + return alpha * scale_out / (scale_x * scale_y); + } + + void CreatePrimitive(const ExecutionContext& ctx) { + dnnl::primitive_attr attr; + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + constexpr unsigned tensor_wide_scale = 0; + attr.set_output_scales(tensor_wide_scale, {scale_out}); + } + + auto matmul_d = dnnl::matmul::desc(x_mem_.get_desc(), y_mem_.get_desc(), + out_mem_.get_desc()); + auto matmul_pd = dnnl::matmul::primitive_desc(matmul_d, attr, engine_); + matmul_prim_ = dnnl::matmul(matmul_pd); + } + + void Execute() { + dnnl::stream stream(engine_); + + void* x_ptr = x_mem_.get_data_handle(); + void* y_ptr = y_mem_.get_data_handle(); + void* out_ptr = out_mem_.get_data_handle(); + for (uint16_t i = 0; i < batch_size_; i++) { + x_mem_.set_data_handle(x_ptr); + y_mem_.set_data_handle(y_ptr); + out_mem_.set_data_handle(out_ptr); + matmul_prim_.execute(stream, { + {MKLDNN_ARG_SRC, x_mem_}, + {MKLDNN_ARG_WEIGHTS, y_mem_}, + {MKLDNN_ARG_DST, out_mem_}, + }); + x_ptr = static_cast(x_ptr) + x_offset_; + y_ptr = static_cast(y_ptr) + y_offset_; + out_ptr = static_cast(out_ptr) + out_offset_; + } + stream.wait(); + } + + void SetOutputFormat(const ExecutionContext& ctx) { + using platform::MKLDNNFormatForSize; + auto* out = ctx.Output("Out"); + auto format = + MKLDNNFormatForSize(out->dims().size(), MKLDNNMemoryFormat::nchw); + out->set_format(format); + out->set_layout(DataLayout::kMKLDNN); + } + + void UpdateDataPointers(const ExecutionContext& ctx) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + x_mem_.set_data_handle(to_void_cast(x->data())); + y_mem_.set_data_handle(to_void_cast(y->data())); + out_mem_.set_data_handle(out->mutable_data(ctx.GetPlace())); + } + + // If initialized, x memory should've been already initialized + bool IsInitialized() { return initialized_; } + + void SetInitialized() { initialized_ = true; } + + private: + struct memory_offsets { + size_t x_offset; + size_t y_offset; + size_t out_offset; + }; + + dnnl::engine engine_; + dnnl::memory x_mem_; + dnnl::memory y_mem_; + dnnl::memory out_mem_; + dnnl::matmul matmul_prim_; + uint32_t x_offset_; + uint32_t y_offset_; + uint32_t out_offset_; + uint16_t batch_size_; + bool initialized_ = false; +}; + +template +static std::shared_ptr> GetPrimitiveFactory( + const ExecutionContext& ctx) { + const auto& out_name = ctx.OutputName("Out"); + const auto& dev_ctx = ctx.template device_context(); + const auto batch_size = ctx.Input("X")->dims()[0]; + std::string key = platform::CreateKey(dev_ctx, batch_size, out_name); + key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + + auto factory = + std::static_pointer_cast>(dev_ctx.GetBlob(key)); + if (factory == nullptr) { + factory = std::make_shared>(); + dev_ctx.SetBlob(key, factory); + } + + return factory; +} + +// Choose appropriate primitive factory implementation based on inferred +// output type (uint8, int8 or float). +template +static void ExecuteMatMul(const ExecutionContext& ctx) { + constexpr bool is_int8 = IsInt8(); + constexpr bool is_bfloat16 = IsBfloat16(); + const bool force_fp32_output = ctx.Attr("force_fp32_output"); + constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { + GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + } else if (is_bfloat16) { + GetPrimitiveFactory(ctx) + ->CreateAndExecute(ctx); + } else if (fuse_relu) { + GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + } else { + GetPrimitiveFactory(ctx)->CreateAndExecute(ctx); + } +} + +template +class DNNLMatMulKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), 1, + platform::errors::Unimplemented( + "DNNL matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + platform::MKLDNNDeviceContext::tls().log_lib_version(); + ExecuteMatMul(ctx); + } +}; + +template +class MatMulGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), 1, + platform::errors::Unimplemented( + "DNNL matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + RunKernel(ctx); + } + + private: + void ExecuteMatMulGrad(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine& engine, Tensor* x, bool trans_x, + bool is_fold_init_dims_x, Tensor* y, bool trans_y, + bool is_fold_init_dims_y, Tensor* out, + int execution_number) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + Tensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } + + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + + MatMulMKLDNNHandler handler( + dev_ctx, engine, ctx.GetPlace(), &x_combined, trans_x, &y_combined, + trans_y, out, alpha, ctx.InputName(framework::GradVarName("Out")) + + std::to_string(execution_number)); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( + framework::vectorize(out->dims())))); + } + + template + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + bool transpose_x = ctx.HasAttr("transpose_X") + ? ctx.Attr("transpose_X") + : ctx.Attr("trans_x"); + bool transpose_y = ctx.HasAttr("transpose_Y") + ? ctx.Attr("transpose_Y") + : ctx.Attr("trans_y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, true, true, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, true, false, dy, 1); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, false, false, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, false, false, + &dout, false, true, dy, 1); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, false, true, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, false, true, dy, 1); + } else { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, true, true, + &dout, false, true, dy, 1); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + dx->set_format(x.format()); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + dy->set_format(y.format()); + } + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, + ops::DNNLMatMulKernel, + ops::DNNLMatMulKernel, + ops::DNNLMatMulKernel, + ops::DNNLMatMulKernel); + +REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulGradMKLDNNKernel, + ops::MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index a0e8ad5110dcd..79356be5be1ff 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -32,14 +32,17 @@ class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT { public: MatMulV2MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, - std::vector& x_dims, bool trans_x, - std::vector& y_dims, bool trans_y, - const std::string& uniq_name, float scale = 1.0f) + const std::vector& x_org_dims, bool trans_x, + const std::vector& y_org_dims, bool trans_y, + const std::string& uniq_name) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, x_dims, uniq_name)) { + platform::CreateKey(dev_ctx, x_org_dims, uniq_name)) { if (!this->isCached()) { // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + const int MB_idx = x_dims.size() - 3; const int H_idx = x_dims.size() - 2; const int W_idx = x_dims.size() - 1; @@ -105,33 +108,18 @@ class MatMulV2MKLDNNKernel : public MatMulGradMKLDNNKernel { public: void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } - private: - void RunKernel(const ExecutionContext& ctx) const { - const auto& dev_ctx = ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool trans_x = ctx.Attr("trans_x"); - bool trans_y = ctx.Attr("trans_y"); - - auto x_dims = framework::vectorize(x->dims()); - auto y_dims = framework::vectorize(y->dims()); - auto out_dims = framework::vectorize(out->dims()); - - int ndims = std::max(x->dims().size(), y->dims().size()); - ndims = std::max(ndims, 3); - - std::vector x_bd_dims(ndims, 1); - std::vector y_bd_dims(ndims, 1); - - this->CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, - out); - - MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), - x_bd_dims, trans_x, y_bd_dims, trans_y, - ctx.InputName("X")); + protected: + void ExecuteMatMul(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine onednn_engine, + platform::Place cpu_place, const Tensor* x, + std::vector& x_dims, bool trans_x, + const Tensor* y, std::vector& y_dims, + bool trans_y, Tensor* out, std::vector& out_dims, + int execution_number = 0) const { + MatMulV2MKLDNNHandler handler( + dev_ctx, onednn_engine, ctx.GetPlace(), x_dims, trans_x, y_dims, + trans_y, ctx.InputName("X") + std::to_string(execution_number)); const auto src_memory_p = handler.AcquireSrcMemory(x); const auto weights_memory_p = handler.AcquireWeightsMemory(y); @@ -152,12 +140,6 @@ class MatMulV2MKLDNNKernel : public MatMulGradMKLDNNKernel { out->set_format( GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims))); } -}; - -template -class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { - public: - void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } private: void CalculateMatrixDims(const ExecutionContext& ctx, @@ -168,6 +150,9 @@ class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { std::vector& out_dims, Tensor* out) const { if (x_dims.size() == 1) { x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; + } else if (x_dims.size() == 2) { + x_bd_dims[2] = x_dims[1]; + x_bd_dims[1] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { x_bd_dims[i] = x_dims[i]; @@ -175,6 +160,9 @@ class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { } if (y_dims.size() == 1) { y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; + } else if (y_dims.size() == 2) { + y_bd_dims[2] = y_dims[1]; + y_bd_dims[1] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { y_bd_dims[i] = y_dims[i]; @@ -196,6 +184,84 @@ class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { } } + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + auto x_dims = framework::vectorize(x->dims()); + auto y_dims = framework::vectorize(y->dims()); + auto out_dims = framework::vectorize(out->dims()); + + int ndims = std::max(x->dims().size(), y->dims().size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + this->CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, + out_dims, out); + + ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_bd_dims, + trans_x, y, y_bd_dims, trans_y, out, out_dims); + } +}; + +template +class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } + + private: + void CalculateGradMatrixDims(const ExecutionContext& ctx, Tensor* dx_tmp, + Tensor* dy_tmp, + const std::vector& dx_dims, + const std::vector& dy_dims, + std::vector& dx_bd_dims, + std::vector& dy_bd_dims) const { + for (size_t i = 0; i < dx_dims.size() - 2; ++i) { + if (dx_dims[i] != dy_dims[i]) { + if (dx_dims[i] == 1) { + dx_bd_dims[i] = dy_dims[i]; + } else { + dy_bd_dims[i] = dx_dims[i]; + } + } + } + + dx_tmp->Resize(framework::make_ddim(dx_bd_dims)); + dx_tmp->mutable_data(ctx.GetPlace()); + dy_tmp->Resize(framework::make_ddim(dy_bd_dims)); + dy_tmp->mutable_data(ctx.GetPlace()); + } + + void ReduceSumForMatmulGradOutput(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine onednn_engine, + const Tensor* dx_tmp, Tensor* dx, + std::vector dx_dims) const { + platform::ReductionMKLDNNHandler handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine, + ctx.GetPlace(), dx_tmp, dx, ctx.InputName("X"), dx_dims); + + auto src_memory_p = handler.AcquireSrcMemory(dx_tmp); + auto dst_memory_p = handler.AcquireDstMemory(dx); + + std::unordered_map reduction_args = { + {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + auto reduction_p = handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, reduction_args); + astream.wait(); + } + void RunKernel(const ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); @@ -212,19 +278,22 @@ class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { } else if (x_dims.size() != y_dims.size()) { is_broadcast = true; } else { - is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_dims.size() - 2, - y_dims.cbegin()); + is_broadcast = + !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_dims.size() - 2, + y_dims.cbegin()); } - if(!is_broadcast){ + // if no broadcasting is needed, we can simply use matmul's grad and avoid + // using reduce_sum + if (!is_broadcast) { MatMulGradMKLDNNKernel::Compute(ctx); return; } - auto* dout=ctx.Input(framework::GradVarName("Out")); + auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - + bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); auto dout_dims = framework::vectorize(dout->dims()); @@ -232,32 +301,63 @@ class MatMulV2GradMKLDNNKernel : public MatMulGradMKLDNNKernel { int ndims = std::max(x->dims().size(), y->dims().size()); ndims = std::max(ndims, 3); - std::vector x_bd_dims(ndims, 1); - std::vector y_bd_dims(ndims, 1); + // in broadcasting scenario new memory is required because + // reduce sum must be calculated upon broadcasted dims + Tensor dx_tmp, dy_tmp; + + std::vector dx_bd_dims(x_dims); + std::vector dy_bd_dims(y_dims); + + CalculateGradMatrixDims(ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, dx_bd_dims, + dy_bd_dims); + + if (trans_x && trans_y) { + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, + y_dims, true, dout, dout_dims, true, &dx_tmp, + dx_bd_dims, 1); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, + 2); + } else if (trans_x) { + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, + y_dims, false, dout, dout_dims, true, &dx_tmp, + dx_bd_dims, 1); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, + x_dims, false, dout, dout_dims, false, &dy_tmp, + dy_bd_dims, 2); + } else if (trans_y) { + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, false, &dx_tmp, + dx_bd_dims, 1); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, false, &dy_tmp, + dy_bd_dims, 2); + } else { + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, true, &dx_tmp, + dx_bd_dims, 1); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, + x_dims, true, dout, dout_dims, false, &dy_tmp, + dy_bd_dims, 2); + } + + if (x_dims != dx_bd_dims) { + ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dx_tmp, dx, + x_dims); + } else { + *dx = std::move(dx_tmp); + } + if (y_dims != dy_bd_dims) { + ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dy_tmp, dy, + y_dims); + } else { + *dy = std::move(dy_tmp); + } - //CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, dout_dims, - // dout); - - //MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), - // x_bd_dims, trans_x, y_bd_dims, trans_y, - // ctx.InputName("X")); - //const auto src_memory_p = handler.AcquireSrcMemory(x); - //const auto weights_memory_p = handler.AcquireWeightsMemory(y); - //const auto dst_memory_p = handler.AcquireDstMemory(dout); - //auto matmul_p = handler.AcquireForwardPrimitive(); - - //std::unordered_map matmul_args = { - // {DNNL_ARG_SRC, *src_memory_p}, - // {DNNL_ARG_WEIGHTS, *weights_memory_p}, - // {DNNL_ARG_DST, *dst_memory_p}}; - - //auto& astream = MKLDNNDeviceContext::tls().get_stream(); - //matmul_p->execute(astream, matmul_args); - //astream.wait(); - - //dout->set_layout(framework::DataLayout::kMKLDNN); - //dout->set_format( - // GetMKLDNNFormat(dst_memory_p->get_desc().reshape(dout_dims))); + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(x->format()); + dy->set_layout(framework::DataLayout::kMKLDNN); + dy->set_format(y->format()); } }; @@ -270,5 +370,5 @@ REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::MatMulV2MKLDNNKernel); REGISTER_OP_KERNEL(matmul_v2_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::MatMulV2GradMKLDNNKernel, - ops::MatMulV2GradMKLDNNKernel); + ops::MatMulV2GradMKLDNNKernel, + ops::MatMulV2GradMKLDNNKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 68865c8ebc855..aa24e0d6d0823 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +from functools import reduce import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 @@ -24,11 +25,11 @@ import paddle.fluid.framework as framework -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): +def reference_matmul(X, Y, trans_x=False, trans_y=False): """Reference forward implementation using np.matmul.""" # np.matmul does not support the transpose flags, so we manually # transpose X and Y appropriately. - if transpose_X: + if trans_x: if X.ndim == 1: X = X.reshape((X.size, )) elif X.ndim == 2: @@ -37,7 +38,7 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): dim = [i for i in range(len(X.shape))] dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] X = np.transpose(X, tuple(dim)) - if transpose_Y: + if trans_y: if Y.ndim == 1: Y = Y.reshape((Y.size, )) else: @@ -52,7 +53,7 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): class TestMatMulV2VectorXVectorOneDNNOp(OpTest): def config(self): self.x_shape = (2, 3, 10, 5) - self.y_shape = (2, 3, 5, 10) + self.y_shape = (1, 3, 5, 10) self.trans_x = False self.trans_y = False @@ -89,150 +90,150 @@ def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') -#class TestMatMulV2VectorXMatrixTransposeYOneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (100, ) -# self.y_shape = (1, 3, 2, 100) -# self.trans_x = False -# self.trans_y = True -# -# -#class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (100, ) -# self.y_shape = (1, 1, 100, 2) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXVectorTransposeXOneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 1, 100, 1) -# self.y_shape = (100, ) -# self.trans_x = True -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 2, 1, 100) -# self.y_shape = (100, ) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 1, 2, 100) -# self.y_shape = (1, 1, 100, 1) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 1, 1, 100) -# self.y_shape = (2, 1, 2, 100) -# self.trans_x = False -# self.trans_y = True -# -# -#class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 1, 12, 4) -# self.y_shape = (1, 2, 4, 12) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (2, 1, 2, 100) -# self.y_shape = (1, 1, 100, 2) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (2, 1, 4, 25) -# self.y_shape = (1, 1, 4, 25) -# self.trans_x = True -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (2, 2, 5, 4) -# self.y_shape = (2, 2, 5, 3) -# self.trans_x = True -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (3, 1, 6, 5) -# self.y_shape = (1, 2, 6, 9) -# self.trans_x = True -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (3, 1, 6, 6) -# self.y_shape = (1, 2, 6, 9) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (100) -# self.y_shape = (1, 2, 2, 100, 2) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (2, 1, 40) -# self.y_shape = (40) -# self.trans_x = False -# self.trans_y = False -# -# -#class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (3, 1, 10, 8) -# self.y_shape = (1, 2, 9, 10) -# self.trans_x = True -# self.trans_y = True -# -# -#class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (3, 1, 10, 10) -# self.y_shape = (1, 2, 9, 10) -# self.trans_x = False -# self.trans_y = True -# -# -#class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp( -# TestMatMulV2VectorXVectorOneDNNOp): -# def config(self): -# self.x_shape = (1, 3, 1, 10, 10) -# self.y_shape = (3, 1, 2, 9, 10) -# self.trans_x = False -# self.trans_y = True -# +class TestMatMulV2VectorXMatrixTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMulV2VectorXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXVectorTransposeXOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMulV2MatrixXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 1) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix2OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 1, 12, 9) + self.y_shape = (1, 3, 9, 12) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix3OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) + self.trans_x = True + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 2, 7, 4) + self.y_shape = (2, 2, 7, 5) + self.trans_x = True + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (3, 1, 6, 7) + self.y_shape = (1, 2, 6, 9) + self.trans_x = True + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix4OneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2VectorXMatrix5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2Matrix3DXVectorOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (3, 1, 10, 8) + self.y_shape = (1, 2, 9, 10) + self.trans_x = True + self.trans_y = True + + +class TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 9, 10) + self.trans_x = False + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 3, 1, 10, 10) + self.y_shape = (3, 1, 2, 9, 10) + self.trans_x = False + self.trans_y = True + # BF16 TESTS def create_bf16_test_class(parent): @@ -242,46 +243,134 @@ def set_inputs(self, x, y): 'X': convert_float_to_uint16(x), 'Y': convert_float_to_uint16(y) } + self.x_fp32 = x + self.y_fp32 = y def set_dtype_attr(self): self.attrs['mkldnn_data_type'] = "bfloat16" def test_check_output(self): - if core.is_compiled_with_cuda(): - self.skipTest( - "OneDNN doesn't support bf16 with CUDA, skipping UT" + - self.__class__.__name__) - elif not core.supports_bfloat16(): - self.skipTest("Core doesn't support bf16, skipping UT" + - self.__class__.__name__) + self.check_output_with_place(core.CPUPlace()) + + def matmul_grad(self, x, transpose_x, y, transpose_y): + x = np.transpose( + x, self.shape_transpose_axes[x.ndim]) if transpose_x else x + y = np.transpose( + y, self.shape_transpose_axes[y.ndim]) if transpose_y else y + + return np.matmul(x, y) + + def calculate_grads(self): + self.shape_transpose_axes = { + 2: [1, 0], + 3: [0, 2, 1], + 4: [0, 1, 3, 2], + 5: [0, 1, 2, 4, 3] + } + + # expand vector so it will be a valid matrix for multiplication + if self.x_fp32.ndim == 1: + self.x_fp32 = np.expand_dims(self.x_fp32, axis=0) + if self.y_fp32.ndim == 1: + self.y_fp32 = np.expand_dims(self.y_fp32, axis=1) + + x_transpose_axes = self.shape_transpose_axes[self.x_fp32.ndim] + y_transpose_axes = self.shape_transpose_axes[self.y_fp32.ndim] + + x = np.transpose(self.x_fp32, x_transpose_axes) if self.attrs[ + 'trans_x'] is True else self.x_fp32 + y = np.transpose(self.y_fp32, y_transpose_axes) if self.attrs[ + 'trans_y'] is True else self.y_fp32 + + dout = np.matmul(x, y) + + x_shape = x.shape + y_shape = y.shape + + if x.ndim <= 2 or y.ndim <= 2: + is_broadcast = False + elif x.ndim != y.ndim: + is_broadcast = True else: - self.check_output_with_place(core.CPUPlace()) + is_broadcast = x.shape[0:-2] != y.shape[0:-2] + + if self.attrs['trans_x'] is True and self.attrs['trans_y'] is True: + self.dx = self.matmul_grad(self.y_fp32, True, dout, True) + self.dy = self.matmul_grad(dout, True, self.x_fp32, True) + elif self.attrs['trans_x'] is True and self.attrs[ + 'trans_y'] is False: + self.dx = self.matmul_grad(self.y_fp32, False, dout, True) + self.dy = self.matmul_grad(self.x_fp32, False, dout, False) + elif self.attrs['trans_x'] is False and self.attrs[ + 'trans_y'] is True: + self.dx = self.matmul_grad(dout, False, self.y_fp32, False) + self.dy = self.matmul_grad(dout, True, self.x_fp32, False) + else: + self.dx = self.matmul_grad(dout, False, self.y_fp32, True) + self.dy = self.matmul_grad(self.x_fp32, True, dout, False) + + if is_broadcast: + x_reduce_axis = [] + y_reduce_axis = [] + for index, ( + first, second + ) in enumerate(zip(x_shape[0:-2], self.dx.shape[0:-2])): + if first != second: + x_reduce_axis.append(index) + + for index, ( + first, second + ) in enumerate(zip(y_shape[0:-2], self.dy.shape[0:-2])): + if first != second: + y_reduce_axis.append(index) + + if x_reduce_axis: + self.dx = self.dx.sum(axis=tuple(x_reduce_axis), + keepdims=True) + if y_reduce_axis: + self.dy = self.dy.sum(axis=tuple(y_reduce_axis), + keepdims=True) + + # after multiplying with vector one dimension is deleted from tensor + if len(x_shape) == 2 and x_shape[0] == 1: + print("here") + dout = dout.sum(axis=-2) + if len(y_shape) == 2 and y_shape[1] == 1: + print("here") + dout = dout.sum(axis=-1) + + self.dout = dout def test_check_grad(self): - pass + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) cls_name = "{0}_{1}".format(parent.__name__, "BF16") TestMatMulV2Bf16OneDNNOp.__name__ = cls_name globals()[cls_name] = TestMatMulV2Bf16OneDNNOp -#create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp) -#create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp) -#create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp) -#create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) -#create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2VectorXMatrixTransposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2VectorXMatrixOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXVectorTransposeXOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXVectorOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrixOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix2OneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix3OneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeXOneDNNOp2) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTranposeX2OneDNNOp3) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeX3OneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix4OneDNNOp) +create_bf16_test_class(TestMatMulV2VectorXMatrix5DOneDNNOp) +create_bf16_test_class(TestMatMulV2Matrix3DXVectorOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) if __name__ == "__main__": paddle.enable_static() From dfd14d039423df3a15f8b8cab0118dba599c3723 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 13:05:50 +0200 Subject: [PATCH 3/7] minor changes --- .../fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index f221e0540b192..ce59614131577 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -24,9 +24,6 @@ import paddle.fluid as fluid import paddle.fluid.framework as framework -paddle.enable_static() - - def reference_matmul(X, Y, trans_x=False, trans_y=False): """Reference forward implementation using np.matmul.""" # np.matmul does not support the transpose flags, so we manually @@ -336,10 +333,8 @@ def calculate_grads(self): # after multiplying with vector one dimension is deleted from tensor if len(x_shape) == 2 and x_shape[0] == 1: - print("here") dout = dout.sum(axis=-2) if len(y_shape) == 2 and y_shape[1] == 1: - print("here") dout = dout.sum(axis=-1) self.dout = dout From 8caf5584dcfe44e87f1b0f87a7a0ced6c81b1122 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 13:13:58 +0200 Subject: [PATCH 4/7] minor changes --- paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc | 4 ++-- .../fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 79356be5be1ff..73397786f57b9 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -204,8 +204,8 @@ class MatMulV2MKLDNNKernel : public MatMulGradMKLDNNKernel { std::vector x_bd_dims(ndims, 1); std::vector y_bd_dims(ndims, 1); - this->CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, - out_dims, out); + CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, + out); ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_bd_dims, trans_x, y, y_bd_dims, trans_y, out, out_dims); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index ce59614131577..87dcf55d78239 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -24,6 +24,7 @@ import paddle.fluid as fluid import paddle.fluid.framework as framework + def reference_matmul(X, Y, trans_x=False, trans_y=False): """Reference forward implementation using np.matmul.""" # np.matmul does not support the transpose flags, so we manually @@ -51,8 +52,8 @@ def reference_matmul(X, Y, trans_x=False, trans_y=False): class TestMatMulV2VectorXVectorOneDNNOp(OpTest): def config(self): - self.x_shape = (2, 3, 10, 5) - self.y_shape = (1, 3, 5, 10) + self.x_shape = (100, ) + self.y_shape = (100, ) self.trans_x = False self.trans_y = False From ca35af070c6489a120c7e950cf82151f1022605c Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 13:19:15 +0200 Subject: [PATCH 5/7] minor change for CI approval --- .../mkldnn/test_matmul_v2_mkldnn_op.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 87dcf55d78239..8741d57f4ad25 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -253,7 +253,7 @@ def set_dtype_attr(self): def test_check_output(self): self.check_output_with_place(core.CPUPlace()) - def matmul_grad(self, x, transpose_x, y, transpose_y): + def matmul_bwd(self, x, transpose_x, y, transpose_y): x = np.transpose( x, self.shape_transpose_axes[x.ndim]) if transpose_x else x y = np.transpose( @@ -296,19 +296,19 @@ def calculate_grads(self): is_broadcast = x.shape[0:-2] != y.shape[0:-2] if self.attrs['trans_x'] is True and self.attrs['trans_y'] is True: - self.dx = self.matmul_grad(self.y_fp32, True, dout, True) - self.dy = self.matmul_grad(dout, True, self.x_fp32, True) + self.dx = self.matmul_bwd(self.y_fp32, True, dout, True) + self.dy = self.matmul_bwd(dout, True, self.x_fp32, True) elif self.attrs['trans_x'] is True and self.attrs[ 'trans_y'] is False: - self.dx = self.matmul_grad(self.y_fp32, False, dout, True) - self.dy = self.matmul_grad(self.x_fp32, False, dout, False) + self.dx = self.matmul_bwd(self.y_fp32, False, dout, True) + self.dy = self.matmul_bwd(self.x_fp32, False, dout, False) elif self.attrs['trans_x'] is False and self.attrs[ 'trans_y'] is True: - self.dx = self.matmul_grad(dout, False, self.y_fp32, False) - self.dy = self.matmul_grad(dout, True, self.x_fp32, False) + self.dx = self.matmul_bwd(dout, False, self.y_fp32, False) + self.dy = self.matmul_bwd(dout, True, self.x_fp32, False) else: - self.dx = self.matmul_grad(dout, False, self.y_fp32, True) - self.dy = self.matmul_grad(self.x_fp32, True, dout, False) + self.dx = self.matmul_bwd(dout, False, self.y_fp32, True) + self.dy = self.matmul_bwd(self.x_fp32, True, dout, False) if is_broadcast: x_reduce_axis = [] From 1f4b9630757bdd4ca095260676d438b61ff6ed57 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 13:26:37 +0200 Subject: [PATCH 6/7] CI fix --- paddle/fluid/operators/matmul_v2_op.cc | 2 +- .../mkldnn/test_matmul_v2_mkldnn_op.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index b75ef49b6876d..d39eac0759cdb 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -68,7 +68,7 @@ class MatMulV2Op : public framework::OperatorWithKernel { new_dims.assign(dims_y.begin(), dims_y.end() - 2); } else { new_dims.reserve(ndims_x); - for (int i = 0; i < ndims_x - 2; ++i) { + for (size_t i = 0; i < ndims_x - 2; ++i) { new_dims.push_back(std::max(dims_x[i], dims_y[i])); } } diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 8741d57f4ad25..fe4044a9bb4dd 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -253,7 +253,7 @@ def set_dtype_attr(self): def test_check_output(self): self.check_output_with_place(core.CPUPlace()) - def matmul_bwd(self, x, transpose_x, y, transpose_y): + def tmttml(self, x, transpose_x, y, transpose_y): x = np.transpose( x, self.shape_transpose_axes[x.ndim]) if transpose_x else x y = np.transpose( @@ -296,19 +296,19 @@ def calculate_grads(self): is_broadcast = x.shape[0:-2] != y.shape[0:-2] if self.attrs['trans_x'] is True and self.attrs['trans_y'] is True: - self.dx = self.matmul_bwd(self.y_fp32, True, dout, True) - self.dy = self.matmul_bwd(dout, True, self.x_fp32, True) + self.dx = self.tmttml(self.y_fp32, True, dout, True) + self.dy = self.tmttml(dout, True, self.x_fp32, True) elif self.attrs['trans_x'] is True and self.attrs[ 'trans_y'] is False: - self.dx = self.matmul_bwd(self.y_fp32, False, dout, True) - self.dy = self.matmul_bwd(self.x_fp32, False, dout, False) + self.dx = self.tmttml(self.y_fp32, False, dout, True) + self.dy = self.tmttml(self.x_fp32, False, dout, False) elif self.attrs['trans_x'] is False and self.attrs[ 'trans_y'] is True: - self.dx = self.matmul_bwd(dout, False, self.y_fp32, False) - self.dy = self.matmul_bwd(dout, True, self.x_fp32, False) + self.dx = self.tmttml(dout, False, self.y_fp32, False) + self.dy = self.tmttml(dout, True, self.x_fp32, False) else: - self.dx = self.matmul_bwd(dout, False, self.y_fp32, True) - self.dy = self.matmul_bwd(self.x_fp32, True, dout, False) + self.dx = self.tmttml(dout, False, self.y_fp32, True) + self.dy = self.tmttml(self.x_fp32, True, dout, False) if is_broadcast: x_reduce_axis = [] From 2104d0d7f7897d9c7360cab47f06dbb57af6cb81 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Thu, 15 Jul 2021 13:31:14 +0200 Subject: [PATCH 7/7] CI fix --- .../mkldnn/test_matmul_v2_mkldnn_op.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index fe4044a9bb4dd..3b9d817522561 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -253,7 +253,15 @@ def set_dtype_attr(self): def test_check_output(self): self.check_output_with_place(core.CPUPlace()) - def tmttml(self, x, transpose_x, y, transpose_y): + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def matmul_grad(self, x, transpose_x, y, transpose_y): x = np.transpose( x, self.shape_transpose_axes[x.ndim]) if transpose_x else x y = np.transpose( @@ -296,19 +304,19 @@ def calculate_grads(self): is_broadcast = x.shape[0:-2] != y.shape[0:-2] if self.attrs['trans_x'] is True and self.attrs['trans_y'] is True: - self.dx = self.tmttml(self.y_fp32, True, dout, True) - self.dy = self.tmttml(dout, True, self.x_fp32, True) + self.dx = self.matmul_grad(self.y_fp32, True, dout, True) + self.dy = self.matmul_grad(dout, True, self.x_fp32, True) elif self.attrs['trans_x'] is True and self.attrs[ 'trans_y'] is False: - self.dx = self.tmttml(self.y_fp32, False, dout, True) - self.dy = self.tmttml(self.x_fp32, False, dout, False) + self.dx = self.matmul_grad(self.y_fp32, False, dout, True) + self.dy = self.matmul_grad(self.x_fp32, False, dout, False) elif self.attrs['trans_x'] is False and self.attrs[ 'trans_y'] is True: - self.dx = self.tmttml(dout, False, self.y_fp32, False) - self.dy = self.tmttml(dout, True, self.x_fp32, False) + self.dx = self.matmul_grad(dout, False, self.y_fp32, False) + self.dy = self.matmul_grad(dout, True, self.x_fp32, False) else: - self.dx = self.tmttml(dout, False, self.y_fp32, True) - self.dy = self.tmttml(self.x_fp32, True, dout, False) + self.dx = self.matmul_grad(dout, False, self.y_fp32, True) + self.dy = self.matmul_grad(self.x_fp32, True, dout, False) if is_broadcast: x_reduce_axis = [] @@ -340,14 +348,6 @@ def calculate_grads(self): self.dout = dout - def test_check_grad(self): - self.calculate_grads() - self.check_grad_with_place( - core.CPUPlace(), ["X", "Y"], - "Out", - user_defined_grads=[self.dx, self.dy], - user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) - cls_name = "{0}_{1}".format(parent.__name__, "BF16") TestMatMulV2Bf16OneDNNOp.__name__ = cls_name globals()[cls_name] = TestMatMulV2Bf16OneDNNOp