Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[oneDNN] Fix to issue #34554 #34623

Merged
merged 18 commits into from
Aug 11, 2021
14 changes: 10 additions & 4 deletions paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,19 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
float scale_o = ctx.Attr<float>("Scale_out");
int axis = ctx.Attr<int>("axis");

platform::BinaryMKLDNNHandler<T> handler(
BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z,
scale_x, scale_y, scale_o, ctx.OutputName("Out"));
platform::BinaryMKLDNNHandler<T> handler(BINARY_OP, axis, mkldnn_engine,
ctx.GetPlace(), x, y, z, scale_x,
scale_y, scale_o);

const auto src_x_memory = handler.AcquireSrcMemory(x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
const auto dst_memory = handler.AcquireDstMemory(z);
// For Inplace src and and dst are the same memory object
// (jczaja) UT mechanics is testing inplace for this op
// regardless shapes, which is wrong when X is to be broadcasted as output
// is of bigger shape that X.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify this comment, please?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure. Comment was updated.

auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z))
? src_x_memory
: handler.AcquireDstMemory(z);

const auto binary_prim = handler.AcquireForwardPrimitive();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
if (dx) {
// dx = dout*y
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f,
ctx.InputName(framework::GradVarName("Out")));
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, y, dx, 1.0f, 1.0f, 1.0f);

const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
Expand All @@ -75,9 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
// Handler is having nullptr passed instead of output tensor as
// we want Dst buffer to be allocated by oneDNN not to use Tensor
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f,
ctx.InputName(framework::GradVarName("Out")));
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, x, nullptr, 1.0f, 1.0f, 1.0f);

const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
Expand Down
11 changes: 6 additions & 5 deletions paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
paddle::platform::errors::PreconditionNotMet(
"Operator DNNL eletwise_forward must use CPUPlace"));
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();

const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");

bool is_inplaced = x->IsSharedBufferWith(*y);

platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, dev_ctx,
ctx.GetPlace(), x,
ctx.InputName("X"), is_inplaced);
platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
ctx.GetPlace(), x);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
Expand All @@ -106,13 +106,14 @@ template <typename T>
void eltwise_grad(const framework::ExecutionContext &ctx,
mkldnn::algorithm algorithm) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();

const auto *x = ctx.Input<Tensor>("X");
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

platform::ActivationMKLDNNHandler<T> handler(
algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X"));
platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
ctx.GetPlace(), x, diff_y);

auto src_memory_p = handler.AcquireBackwardSrcMemory(x);
auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/mkldnn/caching_tests.cmake
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce)
cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce)
8 changes: 5 additions & 3 deletions paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,20 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
void RunKernel(const framework::ExecutionContext& ctx) const {
const auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();

auto* x = ctx.Input<Tensor>("X");
auto* out = ctx.Output<Tensor>("Out");

bool is_inplaced = x->IsSharedBufferWith(*out);

platform::ActivationMKLDNNHandler<T> handler(
mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x,
ctx.InputName("X"), is_inplaced);
mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
x);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = handler.AcquireDstMemory(out);
auto dst_memory_p =
is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
auto activation_p = handler.AcquireForwardPrimitive();

auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
Expand Down
103 changes: 44 additions & 59 deletions paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,69 +32,54 @@ using platform::to_void_cast;

template <typename T>
class SoftmaxMKLDNNHandler
: public platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward> {
: public platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward> {
public:
SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const Tensor* input,
Tensor* output, const int axis,
const std::string uniq_name, bool is_inplaced)
: platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
dev_ctx, mkldnn_engine, cpu_place,
// Softmax may be inplace then uniq_name is no longer unique
is_inplaced ? platform::CreateKey(
dev_ctx, framework::vectorize(input->dims()),
axis, uniq_name)
: platform::CreateKey(
dev_ctx, framework::vectorize(input->dims()),
uniq_name)) {
if (!this->isCached()) {
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
platform::errors::InvalidArgument(
"The shape of input and output tensor must be identical."));

auto softmax_tz = framework::vectorize(input->dims());
auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
input->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
axis);
}
Tensor* output, const int axis)
: platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
mkldnn_engine, cpu_place) {
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
platform::errors::InvalidArgument(
"The shape of input and output tensor must be identical."));

auto softmax_tz = framework::vectorize(input->dims());
auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
input->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
axis);
}

SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx,
const MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const Tensor* out,
const Tensor* out_grad, Tensor* in_x_grad,
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(out->dims()),
unique_name)) {
if (!this->isBwdCached()) {
PADDLE_ENFORCE_EQ(
out_grad->dims(), in_x_grad->dims(),
platform::errors::InvalidArgument("The shape of softmax_grad's input "
"and output must be identical."));

auto dims = out_grad->dims(); // input and output share the same shape
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
auto softmax_tz = framework::vectorize<int64_t>(dims);

auto data_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
auto diff_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
data_softmax_md, axis);
this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
axis);
}
: platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
mkldnn_engine, cpu_place) {
PADDLE_ENFORCE_EQ(
out_grad->dims(), in_x_grad->dims(),
platform::errors::InvalidArgument("The shape of softmax_grad's input "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we recommend that the expected value and actual value are also reported in the error message

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

"and output must be identical."));

auto dims = out_grad->dims(); // input and output share the same shape
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
auto softmax_tz = framework::vectorize<int64_t>(dims);

auto data_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
auto diff_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
data_softmax_md, axis);
this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
axis);
}
};

Expand All @@ -111,9 +96,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {

const int axis = CanonicalAxis(ctx.Attr<int>("axis"), input->dims().size());

SoftmaxMKLDNNHandler<T> handler(dev_ctx, mkldnn_engine, ctx.GetPlace(),
input, output, axis, ctx.OutputName("Out"),
is_inplaced);
SoftmaxMKLDNNHandler<T> handler(mkldnn_engine, ctx.GetPlace(), input,
output, axis);

auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
// For Inplace src and and dst are the same memory object
Expand Down Expand Up @@ -149,11 +133,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
paddle::platform::errors::PreconditionNotMet(
"Operator DNNL SoftmaxGrad must use CPUPlace"));
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* output = ctx.Input<Tensor>("Out");
auto* out_grad = ctx.template Input<Tensor>(framework::GradVarName("Out"));
auto* in_x_grad = ctx.template Output<Tensor>(framework::GradVarName("X"));

SoftmaxMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), output,
SoftmaxMKLDNNHandler<T> handler(ctx, mkldnn_engine, ctx.GetPlace(), output,
out_grad, in_x_grad, ctx.InputName("Out"));

auto dst_memory_p = handler.AcquireDstMemory(output);
Expand Down
84 changes: 29 additions & 55 deletions paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP(conv2d);
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);

namespace paddle {
namespace operators {
Expand Down Expand Up @@ -64,16 +66,19 @@ class CacheTester {

template <typename T>
void RunOperator(const platform::Place &place, const std::string &op_type,
const framework::DDim &dims, const std::string &output_name,
bool inplace = false) {
const framework::DDim &dims, const std::string &first_input) {
framework::Scope scope;

std::map<const std::string, int> num_inputs = {{"softmax", 1},
{"relu", 1},
{"conv2d", 2},
{"elementwise_add", 2},
{"elementwise_mul", 2}};

std::string first_input = inplace == true ? output_name : "x";
std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X";
std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y";
std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out";
std::string output_name = "output";

std::vector<InputVars> input_names = {
{first_input, scope.Var(first_input)->GetMutable<framework::LoDTensor>()},
Expand Down Expand Up @@ -113,71 +118,40 @@ void RunOperator(const platform::Place &place, const std::string &op_type,

auto &pool = platform::DeviceContextPool::Instance();

auto op = num_inputs[op_type] > 1
? framework::OpRegistry::CreateOp(
op_type, {{"X", {first_input}}, {"Y", {"x1"}}},
{{"Out", {output_name}}}, {{"use_mkldnn", {true}}})
: framework::OpRegistry::CreateOp(
op_type, {{"X", {first_input}}}, {{"Out", {output_name}}},
{{"use_mkldnn", {true}}});
auto op =
num_inputs[op_type] > 1
? framework::OpRegistry::CreateOp(
op_type, {{first_input_var_name, {first_input}},
{second_input_var_name, {"x1"}}},
{{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}})
: framework::OpRegistry::CreateOp(
op_type, {{first_input_var_name, {first_input}}},
{{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}});

op->Run(scope, place);
pool.Get(place)->Wait();
}

TEST(test_softmax_reuse_cache, cpu_place) {
framework::DDim dims({32, 64});
TEST(test_conv2d_reuse_cache, cpu_place) {
framework::DDim dims({1, 16, 32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out");
PADDLE_ENFORCE_EQ(ct.Analyze(4), true,
RunOperator<float>(p, "conv2d", dims, "input_signal");
RunOperator<float>(p, "conv2d", dims, "input_signal");
PADDLE_ENFORCE_EQ(ct.Analyze(9), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
"Invalid number of cached oneDNN objects"));
}

TEST(test_softmax_noreuse_cache, cpu_place) {
framework::DDim dims({32, 64});
TEST(test_conv2d_noreuse_cache, cpu_place) {
framework::DDim dims({1, 16, 32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out2");
PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
RunOperator<float>(p, "conv2d", dims, "input_signal");
RunOperator<float>(p, "conv2d", dims, "input_signal2");
PADDLE_ENFORCE_EQ(ct.Analyze(18), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

TEST(test_softmax_inplace_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

TEST(test_relu_inplace_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "relu", dims, "relu_out");
RunOperator<float>(p, "relu", dims, "relu_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

TEST(test_elementwise_add_reuse_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "elementwise_add", dims, "elementwise_add_out");
RunOperator<float>(p, "relu", dims, "elementwise_add_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
"Invalid number of cached oneDNN objects"));
}

} // namespace operators
Expand Down
Loading