[MLU] add mlu activation kernels

fuwenguang committed Apr 13, 2022
1 parent 1cdd88f commit 5a4aedb
Showing 15 changed files with 831 additions and 55 deletions.
138 changes: 110 additions & 28 deletions paddle/fluid/operators/activation_op_mlu.cc
@@ -15,12 +15,8 @@ limitations under the Licnse. */
#include <memory>
#include <string>

#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace operators {
@@ -38,20 +34,39 @@ class ActivationMLUKernel : public framework::OpKernel<T> {
output->mutable_data<T>(ctx.GetPlace());

MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(output->dtype()));

MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(),
reinterpret_cast<const void*>(input->data<T>()),
output_desc.get(),
reinterpret_cast<void*>(output->data<T>()));
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);

MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input),
output_desc.get(), GetBasePtr(output));
}
};

// For gelu, leaky_relu
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernel : public framework::OpKernel<T> {
class ActivationGradMLUKernelV1 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), GetBasePtr(dout), x_desc.get(),
GetBasePtr(x), dx_desc.get(), GetBasePtr(dx));
}
};

// For tanh, sigmoid
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<Tensor>("Out");
Expand All @@ -61,18 +76,35 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc dout_desc(*dout, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(dout->dtype()));
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(out->dtype()));
MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(dx->dtype()));
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(
ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), reinterpret_cast<const void*>(dout->data<T>()),
out_desc.get(), reinterpret_cast<const void*>(out->data<T>()),
dx_desc.get(), reinterpret_cast<void*>(dx->data<T>()));
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, out_desc.get(),
GetBasePtr(out), dout_desc.get(), GetBasePtr(dout),
nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
}
};

// For relu, relu6
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV3 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

dx->mutable_data<T>(ctx.GetPlace());

MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
dout_desc.get(), GetBasePtr(dout), out_desc.get(),
GetBasePtr(out), dx_desc.get(), GetBasePtr(dx));
}
};

@@ -81,10 +113,60 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;

// relu
REGISTER_OP_MLU_KERNEL(
relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu_grad, ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);
relu_grad, ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);

// relu6
REGISTER_OP_MLU_KERNEL(
relu6, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu6_grad, ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6,
paddle::platform::float16>);

// sigmoid
REGISTER_OP_MLU_KERNEL(sigmoid,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);

// tanh
REGISTER_OP_MLU_KERNEL(
tanh, ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
tanh_grad, ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH,
paddle::platform::float16>);

// gelu
REGISTER_OP_MLU_KERNEL(
gelu, ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
gelu_grad, ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU,
paddle::platform::float16>);

// leaky_relu
REGISTER_OP_MLU_KERNEL(
leaky_relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
leaky_relu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
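
Note: the three new gradient kernels differ only in which forward tensor they hand to MLUCnnl::ActiveGrad. The sketch below condenses the call shapes used above, assuming descriptors and pointers named as in the kernels; it is editorial commentary, not part of the commit.

    // V1 -- gelu, leaky_relu: backward consumes the forward input X.
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout), x_desc.get(),
                        GetBasePtr(x), dx_desc.get(), GetBasePtr(dx));
    // V2 -- tanh, sigmoid: backward consumes the forward output Out, passed
    // through the third descriptor/pointer slot.
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, out_desc.get(),
                        GetBasePtr(out), dout_desc.get(), GetBasePtr(dout),
                        nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
    // V3 -- relu, relu6: Out is passed where V1 passes X.
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout), out_desc.get(),
                        GetBasePtr(out), dx_desc.get(), GetBasePtr(dx));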
16 changes: 7 additions & 9 deletions paddle/fluid/operators/fill_constant_op_mlu.cc
@@ -51,6 +51,8 @@ class FillConstantMLUKernel : public framework::OpKernel<T> {
}
}
}
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
if (ctx.HasInput("ValueTensor")) {
auto *value_tensor = ctx.Input<framework::Tensor>("ValueTensor");
PADDLE_ENFORCE_EQ(
@@ -59,22 +61,18 @@ class FillConstantMLUKernel : public framework::OpKernel<T> {
"When use Tensor as value to set Tensor value in fill_cosntant, "
"value input(ValueTensor) size must be 1, but get %d",
value_tensor->numel()));
const T *tensor_data = value_tensor->data<T>();
framework::Tensor mlu_tensor;
value_data = value_tensor->data<T>();
auto tmp_place = value_tensor->place();
if (platform::is_mlu_place(tmp_place)) {
framework::TensorCopySync(*value_tensor, platform::CPUPlace(),
&mlu_tensor);
tensor_data = mlu_tensor.data<T>();
pointer_mode = CNNL_POINTER_MODE_DEVICE;
}
value = tensor_data[0];
}

auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
MLUCnnlTensorDesc output_desc(*out_var, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(out_var->dtype()));
MLUCnnl::Fill(ctx, value, output_desc.get(), GetBasePtr(out_var));
MLUCnnlTensorDesc output_desc(*out_var);
MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(),
GetBasePtr(out_var));
}
};
} // namespace operators
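Note: with cnnlFill_v3 the fill value is passed by pointer together with a pointer mode, so a ValueTensor that already lives on the MLU can be consumed in place instead of being copied back to the host first. A condensed sketch of the dispatch above (editorial, not part of the commit):

    const T* value_data = &value;                        // scalar attr on host
    cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
    if (ctx.HasInput("ValueTensor")) {
      value_data = value_tensor->data<T>();              // CPU or MLU pointer
      if (platform::is_mlu_place(value_tensor->place())) {
        pointer_mode = CNNL_POINTER_MODE_DEVICE;         // no TensorCopySync needed
      }
    }
    MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(),
                  GetBasePtr(out_var));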
3 changes: 2 additions & 1 deletion paddle/fluid/operators/mean_op_mlu.cc
@@ -95,7 +95,8 @@ class MeanMLUGradKernel : public framework::OpKernel<T> {
MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(mean_var.dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var));
MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value, mean_var_desc.get(),
GetBasePtr(&mean_var));

// means mul output_grad
MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY,
8 changes: 5 additions & 3 deletions paddle/fluid/operators/metrics/accuracy_op_mlu.cc
@@ -136,15 +136,17 @@ class AccuracyMLUKernel : public framework::OpKernel<T> {
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples, total_desc.get(),
GetBasePtr(total));

// use `total` of type `float32` for calculating accuracy
Tensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
MLUCnnl::Fill(ctx, static_cast<float>(num_samples), total_fp32_desc.get(),
GetBasePtr(&total_fp32));
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples_fp32,
total_fp32_desc.get(), GetBasePtr(&total_fp32));

// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
23 changes: 19 additions & 4 deletions paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -208,8 +208,20 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() {
MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor(
active_desc_, act_mode, CNNL_NOT_PROPAGATE_NAN, ceof));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4(
active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, ceof, 1.0f /*sliced_dim*/,
1.67326319217681884765625 /*selu_alpha*/,
1.05070102214813232421875 /*selu_lambda*/));
}

MLUCnnlActivationDesc::MLUCnnlActivationDesc(
const cnnlActivationMode_t act_mode, const float ceof,
const float sliced_dim, const float selu_alpha, const float selu_lambda) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_));
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4(
active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION,
CNNL_NOT_PROPAGATE_NAN, ceof, sliced_dim, selu_alpha, selu_lambda));
}

const cnnlActivationDescriptor_t MLUCnnlActivationDesc::get() const {
@@ -541,12 +553,15 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_desc, output));
}

/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value,
/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode,
const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output) {
cnnlHandle_t handle = GetHandleFromCTX(ctx);

PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output));
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlFill_v3(handle, pointer_mode, value_ptr, output_desc, output));
}

/* static */ void MLUCnnl::QuantifyOffline(
6 changes: 5 additions & 1 deletion paddle/fluid/operators/mlu/mlu_baseop.h
@@ -218,6 +218,9 @@ class MLUCnnlActivationDesc {
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof);
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof,
const float sliced_dim, const float selu_alpha,
const float selu_lambda);

const cnnlActivationDescriptor_t get() const;
~MLUCnnlActivationDesc();
@@ -418,7 +421,8 @@ class MLUCnnl {
const cnnlTensorDescriptor_t in1_desc, const void* in1,
const cnnlTensorDescriptor_t output_desc, void* output);

static void Fill(const ExecutionContext& ctx, float value,
static void Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode, const void* value_ptr,
const cnnlTensorDescriptor_t output_desc, void* output);

static void LRN(const ExecutionContext& ctx, const int local_size,
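Note: two interface changes land in mlu_baseop (.cc above, .h here). MLUCnnlActivationDesc now calls cnnlSetActivationDescriptor_v4, which adds a compute-precision preference plus sliced_dim and the SELU alpha/lambda constants (the one-argument constructor keeps the old behaviour by passing defaults), and MLUCnnl::Fill now wraps cnnlFill_v3 with an explicit pointer mode. A minimal usage sketch, assuming an ExecutionContext ctx, a Tensor out, and its descriptor out_desc (editorial, not part of the commit):

    // Old-style construction still works; the extra v4 parameters are defaulted.
    MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_GELU, 1.0f /*ceof*/);
    // Fill from a host-side scalar; pass CNNL_POINTER_MODE_DEVICE instead when
    // the value pointer refers to MLU memory.
    float one = 1.0f;
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &one, out_desc.get(),
                  GetBasePtr(&out));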
5 changes: 3 additions & 2 deletions paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
@@ -69,7 +69,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
"the same Tensors."));
}

auto mu = ctx.Attr<float>("mu");
auto mu = static_cast<T>(ctx.Attr<float>("mu"));
auto lrs = ctx.MultiInput<framework::Tensor>("LearningRate");
if (lrs.size() != 1) {
PADDLE_ENFORCE_EQ(
@@ -114,7 +114,8 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {

Tensor mu_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));

for (size_t idx = 0; idx < n; ++idx) {
RegularizationType regularization_flag =
3 changes: 2 additions & 1 deletion paddle/fluid/operators/optimizers/momentum_op_mlu.cc
@@ -52,7 +52,8 @@ class MLUMomentumOpKernel : public framework::OpKernel<T> {
Tensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));

Tensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param);
4 changes: 2 additions & 2 deletions paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
@@ -103,8 +103,8 @@ class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
ToCnnlDataType(input_grad->dtype()));

auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
MLUCnnl::Fill(context, value, input_grad_desc.get(),
GetBasePtr(input_grad));
MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value,
input_grad_desc.get(), GetBasePtr(input_grad));

MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN);
10 changes: 6 additions & 4 deletions paddle/fluid/operators/scale_op_mlu.cc
@@ -27,7 +27,7 @@ class ScaleMLUKernel : public framework::OpKernel<T> {
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);

// cnnl require input, scale, bias with same type. And all in device side.
auto& scale = ctx.Attr<float>("scale");
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
framework::Tensor scale_tensor;
if (ctx.HasInput("ScaleTensor")) {
framework::Tensor float_scale_tensor =
@@ -49,14 +49,16 @@ } else {
} else {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnl::Fill(ctx, scale, scale_desc.get(), GetBasePtr(&scale_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(),
GetBasePtr(&scale_tensor));
}

auto& bias = ctx.Attr<float>("bias");
auto bias = static_cast<T>(ctx.Attr<float>("bias"));
framework::Tensor bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnl::Fill(ctx, bias, bias_desc.get(), GetBasePtr(&bias_tensor));
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &bias, bias_desc.get(),
GetBasePtr(&bias_tensor));

auto* out_var = ctx.OutputVar("Out");
if (in_var->IsType<phi::SelectedRows>() && in_var != out_var) {
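Note: scale_op (and likewise momentum/merged_momentum above) now casts the float attribute to the kernel's T before filling the one-element device tensor, so its dtype matches the input, as the in-kernel comment requires ("cnnl require input, scale, bias with same type. And all in device side."). Editorial sketch for T = platform::float16, not part of the commit:

    auto scale = static_cast<T>(ctx.Attr<float>("scale"));   // fp32 attr -> float16
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(),
                  GetBasePtr(&scale_tensor));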

1 comment on commit 5a4aedb

@paddle-bot-old

Congratulations! Your pull request passed all required CI. You can ask the reviewer(s) to approve and merge. 🎉
