Skip to content

Commit

Permalink
Fix the performance problem when 'axis' is not specified
Browse files Browse the repository at this point in the history
  • Loading branch information
ronny1996 committed Aug 24, 2021
1 parent 36f7e75 commit 3a490d3
Showing 1 changed file with 8 additions and 13 deletions.
21 changes: 8 additions & 13 deletions paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,27 +42,22 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute =
y_dims == framework::slice_ddim(x_dims, axis, x_dims.size());
direct_compute = x_dims.size() == (y_dims.size() + axis);
} else {
direct_compute =
x_dims == framework::slice_ddim(y_dims, axis, y_dims.size());
direct_compute = y_dims.size() == (x_dims.size() + axis);
}

Tensor transformed_x, transformed_y;
if (direct_compute) {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
runner.Run(dev_ctx.stream());
} else {
Tensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &transformed_x,
&transformed_y);
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
runner.Run(dev_ctx.stream());
}
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};

Expand Down

1 comment on commit 3a490d3

@paddle-bot-old
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Congratulations! Your pull request passed all required CI checks. You can now ask the reviewer(s) to approve and merge it. 🎉

Please sign in to comment.