Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "[oneDNN] Fix to issue #34554" #34838

Merged
merged 1 commit into from
Aug 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 4 additions & 15 deletions paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,13 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
float scale_o = ctx.Attr<float>("Scale_out");
int axis = ctx.Attr<int>("axis");

platform::BinaryMKLDNNHandler<T> handler(BINARY_OP, axis, mkldnn_engine,
ctx.GetPlace(), x, y, z, scale_x,
scale_y, scale_o);
platform::BinaryMKLDNNHandler<T> handler(
BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z,
scale_x, scale_y, scale_o, ctx.OutputName("Out"));

const auto src_x_memory = handler.AcquireSrcMemory(x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
// (jczaja) For Inplace src and dst should be the same memory object.
// So x should share buffer with z. But UT mechanics is testing inplace
// execution for this op not checking that x can be bradcasted to match in
// shape y tensor.
// This is wrong as when x is to be broadcasted then z(out) will match the
// shape of y which is bigger than x. Hence if x is smaller in shape than z
// and they share a buffer (of
// shape x) then this buffer is not big enough to hold result of elementwise
// operation.
auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z))
? src_x_memory
: handler.AcquireDstMemory(z);
const auto dst_memory = handler.AcquireDstMemory(z);

const auto binary_prim = handler.AcquireForwardPrimitive();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
if (dx) {
// dx = dout*y
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, y, dx, 1.0f, 1.0f, 1.0f);
dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f,
ctx.InputName(framework::GradVarName("Out")));

const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
Expand All @@ -74,8 +75,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
// Handler is having nullptr passed instead of output tensor as
// we want Dst buffer to be allocated by oneDNN not to use Tensor
platform::BinaryMKLDNNHandler<T> handler(
dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
dout, x, nullptr, 1.0f, 1.0f, 1.0f);
dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine,
ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f,
ctx.InputName(framework::GradVarName("Out")));

const auto src_dout_memory = handler.AcquireSrcMemory(dout);
const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
Expand Down
11 changes: 5 additions & 6 deletions paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
paddle::platform::errors::PreconditionNotMet(
"Operator DNNL eletwise_forward must use CPUPlace"));
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();

const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");

bool is_inplaced = x->IsSharedBufferWith(*y);

platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
ctx.GetPlace(), x);
platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, dev_ctx,
ctx.GetPlace(), x,
ctx.InputName("X"), is_inplaced);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y);
Expand All @@ -106,14 +106,13 @@ template <typename T>
void eltwise_grad(const framework::ExecutionContext &ctx,
mkldnn::algorithm algorithm) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();

const auto *x = ctx.Input<Tensor>("X");
const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

platform::ActivationMKLDNNHandler<T> handler(algorithm, ctx, mkldnn_engine,
ctx.GetPlace(), x, diff_y);
platform::ActivationMKLDNNHandler<T> handler(
algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X"));

auto src_memory_p = handler.AcquireBackwardSrcMemory(x);
auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/mkldnn/caching_tests.cmake
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce)
cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce)
8 changes: 3 additions & 5 deletions paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,18 @@ class ScaleMKLDNNKernel : public framework::OpKernel<T> {
void RunKernel(const framework::ExecutionContext& ctx) const {
const auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();

auto* x = ctx.Input<Tensor>("X");
auto* out = ctx.Output<Tensor>("Out");

bool is_inplaced = x->IsSharedBufferWith(*out);

platform::ActivationMKLDNNHandler<T> handler(
mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
x);
mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x,
ctx.InputName("X"), is_inplaced);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p =
is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
auto dst_memory_p = handler.AcquireDstMemory(out);
auto activation_p = handler.AcquireForwardPrimitive();

auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
Expand Down
105 changes: 59 additions & 46 deletions paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,56 +32,69 @@ using platform::to_void_cast;

template <typename T>
class SoftmaxMKLDNNHandler
: public platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward> {
: public platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward> {
public:
SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine,
SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const Tensor* input,
Tensor* output, const int axis)
: platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
mkldnn_engine, cpu_place) {
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
platform::errors::InvalidArgument(
"The shape of input and output tensor must be identical."));

auto softmax_tz = framework::vectorize(input->dims());
auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
input->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
axis);
Tensor* output, const int axis,
const std::string uniq_name, bool is_inplaced)
: platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
dev_ctx, mkldnn_engine, cpu_place,
// Softmax may be inplace then uniq_name is no longer unique
is_inplaced ? platform::CreateKey(
dev_ctx, framework::vectorize(input->dims()),
axis, uniq_name)
: platform::CreateKey(
dev_ctx, framework::vectorize(input->dims()),
uniq_name)) {
if (!this->isCached()) {
PADDLE_ENFORCE_EQ(
input->dims(), output->dims(),
platform::errors::InvalidArgument(
"The shape of input and output tensor must be identical."));

auto softmax_tz = framework::vectorize(input->dims());
auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType<T>(),
input->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
axis);
}
}

SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx,
const mkldnn::engine mkldnn_engine,
const MKLDNNDeviceContext& dev_ctx,
platform::Place cpu_place, const Tensor* out,
const Tensor* out_grad, Tensor* in_x_grad,
const std::string& unique_name)
: platform::MKLDNNHandlerNoCachingT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
mkldnn_engine, cpu_place) {
PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(),
platform::errors::InvalidArgument(
"The shape of softmax_grad's input "
"and output must be identical, but shapes differ, "
"out_grad: %s in_grad: %s",
out_grad->dims(), in_x_grad->dims()));

auto dims = out_grad->dims(); // input and output share the same shape
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
auto softmax_tz = framework::vectorize<int64_t>(dims);

auto data_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
auto diff_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
data_softmax_md, axis);
this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
axis);
: platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
mkldnn::softmax_backward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(out->dims()),
unique_name)) {
if (!this->isBwdCached()) {
PADDLE_ENFORCE_EQ(
out_grad->dims(), in_x_grad->dims(),
platform::errors::InvalidArgument("The shape of softmax_grad's input "
"and output must be identical."));

auto dims = out_grad->dims(); // input and output share the same shape
const int axis = CanonicalAxis(ctx.Attr<int>("axis"), dims.size());
auto softmax_tz = framework::vectorize<int64_t>(dims);

auto data_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out->format());
auto diff_softmax_md = MKLDNNMemDesc(
softmax_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());

this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring,
data_softmax_md, axis);
this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md,
axis);
}
}
};

Expand All @@ -98,8 +111,9 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {

const int axis = CanonicalAxis(ctx.Attr<int>("axis"), input->dims().size());

SoftmaxMKLDNNHandler<T> handler(mkldnn_engine, ctx.GetPlace(), input,
output, axis);
SoftmaxMKLDNNHandler<T> handler(dev_ctx, mkldnn_engine, ctx.GetPlace(),
input, output, axis, ctx.OutputName("Out"),
is_inplaced);

auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
// For Inplace src and and dst are the same memory object
Expand Down Expand Up @@ -135,12 +149,11 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
paddle::platform::errors::PreconditionNotMet(
"Operator DNNL SoftmaxGrad must use CPUPlace"));
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* output = ctx.Input<Tensor>("Out");
auto* out_grad = ctx.template Input<Tensor>(framework::GradVarName("Out"));
auto* in_x_grad = ctx.template Output<Tensor>(framework::GradVarName("X"));

SoftmaxMKLDNNHandler<T> handler(ctx, mkldnn_engine, ctx.GetPlace(), output,
SoftmaxMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), output,
out_grad, in_x_grad, ctx.InputName("Out"));

auto dst_memory_p = handler.AcquireDstMemory(output);
Expand Down
84 changes: 55 additions & 29 deletions paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP(conv2d);
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);

namespace paddle {
namespace operators {
Expand Down Expand Up @@ -66,19 +64,16 @@ class CacheTester {

template <typename T>
void RunOperator(const platform::Place &place, const std::string &op_type,
const framework::DDim &dims, const std::string &first_input) {
const framework::DDim &dims, const std::string &output_name,
bool inplace = false) {
framework::Scope scope;

std::map<const std::string, int> num_inputs = {{"softmax", 1},
{"relu", 1},
{"conv2d", 2},
{"elementwise_add", 2},
{"elementwise_mul", 2}};

std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X";
std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y";
std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out";
std::string output_name = "output";
std::string first_input = inplace == true ? output_name : "x";

std::vector<InputVars> input_names = {
{first_input, scope.Var(first_input)->GetMutable<framework::LoDTensor>()},
Expand Down Expand Up @@ -118,40 +113,71 @@ void RunOperator(const platform::Place &place, const std::string &op_type,

auto &pool = platform::DeviceContextPool::Instance();

auto op =
num_inputs[op_type] > 1
? framework::OpRegistry::CreateOp(
op_type, {{first_input_var_name, {first_input}},
{second_input_var_name, {"x1"}}},
{{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}})
: framework::OpRegistry::CreateOp(
op_type, {{first_input_var_name, {first_input}}},
{{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}});
auto op = num_inputs[op_type] > 1
? framework::OpRegistry::CreateOp(
op_type, {{"X", {first_input}}, {"Y", {"x1"}}},
{{"Out", {output_name}}}, {{"use_mkldnn", {true}}})
: framework::OpRegistry::CreateOp(
op_type, {{"X", {first_input}}}, {{"Out", {output_name}}},
{{"use_mkldnn", {true}}});

op->Run(scope, place);
pool.Get(place)->Wait();
}

TEST(test_conv2d_reuse_cache, cpu_place) {
framework::DDim dims({1, 16, 32, 64});
TEST(test_softmax_reuse_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "conv2d", dims, "input_signal");
RunOperator<float>(p, "conv2d", dims, "input_signal");
PADDLE_ENFORCE_EQ(ct.Analyze(9), true,
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out");
PADDLE_ENFORCE_EQ(ct.Analyze(4), true,
platform::errors::InvalidArgument(
"Invalid number of cached oneDNN objects"));
"Wrong number of cached oneDNN objects"));
}

TEST(test_conv2d_noreuse_cache, cpu_place) {
framework::DDim dims({1, 16, 32, 64});
TEST(test_softmax_noreuse_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "conv2d", dims, "input_signal");
RunOperator<float>(p, "conv2d", dims, "input_signal2");
PADDLE_ENFORCE_EQ(ct.Analyze(18), true,
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out2");
PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
platform::errors::InvalidArgument(
"Invalid number of cached oneDNN objects"));
"Wrong number of cached oneDNN objects"));
}

TEST(test_softmax_inplace_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "softmax", dims, "softmax_out");
RunOperator<float>(p, "softmax", dims, "softmax_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

TEST(test_relu_inplace_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "relu", dims, "relu_out");
RunOperator<float>(p, "relu", dims, "relu_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(7), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

TEST(test_elementwise_add_reuse_cache, cpu_place) {
framework::DDim dims({32, 64});
platform::CPUPlace p;
CacheTester ct;
RunOperator<float>(p, "elementwise_add", dims, "elementwise_add_out");
RunOperator<float>(p, "relu", dims, "elementwise_add_out", true);
PADDLE_ENFORCE_EQ(ct.Analyze(8), true,
platform::errors::InvalidArgument(
"Wrong number of cached oneDNN objects"));
}

} // namespace operators
Expand Down
Loading