Commit 442688a

add some ops support fp16 in kunlun2 (#36854)
* aaaa

* add some ops support fp16 in kunlun2
taixiurong authored Oct 29, 2021
1 parent 113816d commit 442688a
Showing 14 changed files with 482 additions and 179 deletions.
130 changes: 68 additions & 62 deletions paddle/fluid/operators/activation_op_xpu.cc
@@ -53,14 +53,14 @@ class XPUActivationGradKernel
   }
 };

-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename XPUT>
 void xpu_activation_forward(
     const framework::ExecutionContext &ctx,
-    std::function<int(xpu::Context *, const T *, T *, int)> func) {
+    std::function<int(xpu::Context *, const XPUT *, XPUT *, int)> func) {
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Output<Tensor>("Out");
-  const T *x_data = x->data<T>();
-  T *y_data = y->mutable_data<T>(ctx.GetPlace());
+  const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
+  XPUT *y_data = reinterpret_cast<XPUT *>(y->mutable_data<T>(ctx.GetPlace()));

   auto xpu_context = ctx.device_context<DeviceContext>().x_context();
   int r = func(xpu_context, x_data, y_data, x->numel());
@@ -70,23 +70,24 @@ void xpu_activation_forward(
                       r, XPUAPIErrorMsg[r]));
 }

-template <typename DeviceContext, typename T>
-void xpu_activation_backward(const framework::ExecutionContext &ctx,
-                             std::function<int(xpu::Context *, const T *,
-                                               const T *, const T *, T *, int)>
-                                 func) {
+template <typename DeviceContext, typename T, typename XPUT>
+void xpu_activation_backward(
+    const framework::ExecutionContext &ctx,
+    std::function<int(xpu::Context *, const XPUT *, const XPUT *, const XPUT *,
+                      XPUT *, int)>
+        func) {
   /* TODO: relu tanh sigmoid are inplace */
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Input<Tensor>("Out");
   auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
   auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *x_data = nullptr;
-  const T *y_data = nullptr;
-  const T *y_grad = nullptr;
-  if (x != nullptr) x_data = x->data<T>();
-  if (y != nullptr) y_data = y->data<T>();
-  if (dOut != nullptr) y_grad = dOut->data<T>();
-  T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+  const XPUT *x_data = nullptr;
+  const XPUT *y_data = nullptr;
+  const XPUT *y_grad = nullptr;
+  if (x != nullptr) x_data = reinterpret_cast<const XPUT *>(x->data<T>());
+  if (y != nullptr) y_data = reinterpret_cast<const XPUT *>(y->data<T>());
+  if (dOut != nullptr) y_grad = reinterpret_cast<const XPUT *>(dOut->data<T>());
+  XPUT *x_grad = reinterpret_cast<XPUT *>(dX->mutable_data<T>(ctx.GetPlace()));
   auto xpu_context = ctx.device_context<DeviceContext>().x_context();

   int r = func(xpu_context, x_data, y_data, y_grad, x_grad, dX->numel());
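
The reinterpret_cast in both helpers is what lets one kernel body serve float and float16 alike: for float, XPUT is T and the cast is a no-op; for paddle::platform::float16, XPUT is the device-side half type selected by XPUTypeTrait, and the cast is only sound because the two types share the same 2-byte layout. A self-contained sketch of that idea (the types below are illustrative stand-ins, not Paddle's actual trait):

```cpp
#include <cstdint>

// Illustrative stand-ins for paddle::platform::float16 and the XPU-side half
// type; the real trait is XPUTypeTrait<T>.
struct HostFloat16 { uint16_t bits; };    // framework-side fp16 payload
struct DeviceFloat16 { uint16_t bits; };  // device-API fp16 payload

// Simplified sketch of the trait: identity for most types, remapped for fp16.
template <typename T> struct XPUTypeTraitSketch { using Type = T; };
template <> struct XPUTypeTraitSketch<HostFloat16> { using Type = DeviceFloat16; };

// The reinterpret_cast is only sound because the layouts match exactly.
static_assert(sizeof(HostFloat16) == sizeof(DeviceFloat16),
              "host and device fp16 types must be layout-compatible");

int main() {
  HostFloat16 h{0x3C00};  // bit pattern of 1.0 in IEEE binary16
  auto *d = reinterpret_cast<XPUTypeTraitSketch<HostFloat16>::Type *>(&h);
  return d->bits == 0x3C00 ? 0 : 1;
}
```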
@@ -98,65 +99,64 @@ void xpu_activation_backward(const framework::ExecutionContext &ctx,

 template <typename T>
 struct XPUReluFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::relu<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu<XPUType>);
   }
 };

 template <typename T>
 struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sigmoid<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sigmoid<XPUType>);
   }
 };

 template <typename T>
 struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::tanh<T>);
-  }
-};
-
-template <typename T>
-struct XPUGeluFunctor : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::gelu<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh<XPUType>);
   }
 };

 template <typename T>
 struct XPULogFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::log<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::log<XPUType>);
   }
 };

 template <typename T>
 struct XPUSquareFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::square<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square<XPUType>);
   }
 };

 template <typename T>
 struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::sqrt<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt<XPUType>);
   }
 };

 template <typename T>
 struct XPUAbsFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::abs<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::abs<XPUType>);
   }
 };

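Every functor in this hunk follows the same two-line recipe: name the device type via XPUTypeTrait, then hand the xpu:: primitive instantiated on that type to the shared helper. A toy mirror of that dispatch shape, runnable on the host (Context, relu_kernel, and activation_forward are stand-ins, not the XPU API):

```cpp
#include <functional>
#include <iostream>

struct Context {};  // stand-in for xpu::Context

// Stand-in for a device primitive such as xpu::relu<XPUT>.
template <typename XPUT>
int relu_kernel(Context *, const XPUT *x, XPUT *y, int n) {
  for (int i = 0; i < n; ++i) y[i] = x[i] > XPUT(0) ? x[i] : XPUT(0);
  return 0;  // analogue of XPU_SUCCESS
}

// Mirrors xpu_activation_forward: generic plumbing, with the functor choosing
// the concrete device function and the device-side type XPUT.
template <typename T, typename XPUT>
int activation_forward(Context *ctx, const T *x, T *y, int n,
                       std::function<int(Context *, const XPUT *, XPUT *, int)> f) {
  return f(ctx, reinterpret_cast<const XPUT *>(x),
           reinterpret_cast<XPUT *>(y), n);
}

int main() {
  Context ctx;
  float in[3] = {-1.f, 0.f, 2.f}, out[3];
  // For float, T and XPUT coincide, so the casts are no-ops.
  activation_forward<float, float>(&ctx, in, out, 3, relu_kernel<float>);
  std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << '\n';  // 0 0 2
}
```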
@@ -196,6 +196,7 @@ struct XPUPowFunctor : public BaseActivationFunctor<T> {

 template <typename T>
 struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     float threshold = ctx.Attr<float>("threshold");
     float scale = ctx.Attr<float>("scale");
@@ -208,61 +209,59 @@ struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::hard_swish<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish<XPUType>);
   }
 };

 template <typename T>
 struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::relu_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu_grad<XPUType>);
   }
 };

 template <typename T>
 struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::tanh_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh_grad<XPUType>);
   }
 };

 template <typename T>
 struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sigmoid_grad<T>);
-  }
-};
-
-template <typename T>
-struct XPUGeluGradFunctor : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::gelu_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sigmoid_grad<XPUType>);
   }
 };

 template <typename T>
 struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sqrt_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt_grad<XPUType>);
   }
 };

 template <typename T>
 struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::square_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square_grad<XPUType>);
   }
 };

 template <typename T>
 struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     float threshold = ctx.Attr<float>("threshold");
     float scale = ctx.Attr<float>("scale");
@@ -275,8 +274,8 @@ struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::hard_swish_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish_grad<XPUType>);
   }
 };

@@ -342,16 +341,23 @@ namespace ops = paddle::operators;
     ops::XPUActivationGradKernel<ops::grad_functor<float>>);

 REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
                                XPUSigmoidGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
                                XPUHardSwishGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
                                XPULeakyReluGradFunctor)

+REGISTER_OP_XPU_KERNEL(
+    tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
+    ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
+REGISTER_OP_XPU_KERNEL(
+    tanh_grad, ops::XPUActivationGradKernel<ops::XPUTanhGradFunctor<float>>,
+    ops::XPUActivationGradKernel<
+        ops::XPUTanhGradFunctor<paddle::platform::float16>>);
+
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,
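
The registration hunk is the user-visible payoff: tanh leaves the float-only REGISTER_ACTIVATION_XPU_KERNEL macro and is registered explicitly for both float and paddle::platform::float16, which is what makes the fp16 kernel reachable from dtype dispatch (gelu moves out of this file in the same commit). A toy dtype-keyed registry showing why the fp16 entry matters (this registry is invented for illustration; it is not Paddle's OpKernel machinery):

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>

using Kernel = std::function<void()>;
// Invented stand-in for a kernel registry keyed by (op, dtype).
std::map<std::pair<std::string, std::string>, Kernel> registry;

void Register(const std::string &op, const std::string &dtype, Kernel k) {
  registry[{op, dtype}] = std::move(k);
}

bool Run(const std::string &op, const std::string &dtype) {
  auto it = registry.find({op, dtype});
  if (it == registry.end()) return false;  // dispatch failure
  it->second();
  return true;
}

int main() {
  Register("tanh", "float32", [] { std::cout << "float tanh kernel\n"; });
  // Without the next line, an fp16 graph cannot place tanh on the device.
  Register("tanh", "float16", [] { std::cout << "fp16 tanh kernel\n"; });
  std::cout << Run("tanh", "float16") << '\n';  // 1: dispatch now succeeds
}
```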
33 changes: 10 additions & 23 deletions paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -74,27 +74,15 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                        platform::errors::External("XPU API(logical_not) return wrong "
                                                   "value[%d %s]",
                                                   r, XPUAPIErrorMsg[r]));
-      r = xpu::isnan(dev_ctx.x_context(),
-                     reinterpret_cast<const XPUTyp*>(x->data<T>()),
-                     is_nan.data<bool>(), x->numel());
-      PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                            "XPU API(isnan) return wrong "
-                                            "value[%d %s]",
-                                            r, XPUAPIErrorMsg[r]));
-      r = xpu::logical_or(dev_ctx.x_context(), is_finite.data<bool>(),
-                          is_nan.data<bool>(), is_finite.data<bool>(),
-                          x->numel());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU API(logical_or) return wrong "
-                                     "value[%d %s]",
-                                     r, XPUAPIErrorMsg[r]));
       r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
                    found_inf_data, x->numel());
       PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                             "XPU API(any) return wrong "
                                             "value[%d %s]",
                                             r, XPUAPIErrorMsg[r]));
+      if (dev_ctx.x_context()->xpu_stream) {
+        dev_ctx.Wait();
+      }
       memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
                    BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                    found_inf_data, sizeof(bool));
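
Dropping the isnan/logical_or round-trip is safe because IEEE-754 isfinite is false for NaN as well as for ±Inf, so logical_not(isfinite(x)) already flags everything the old three-call sequence flagged; the new explicit Wait() ensures the device has produced found_inf_data before it is copied to the host. The isfinite fact, checked on the host:

```cpp
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const float nan_v = std::numeric_limits<float>::quiet_NaN();
  const float inf_v = std::numeric_limits<float>::infinity();
  // isfinite rejects NaN and Inf alike, so "not finite" subsumes the old
  // isnan-then-logical_or combination.
  assert(!std::isfinite(nan_v) && std::isnan(nan_v));
  assert(!std::isfinite(inf_v) && !std::isnan(inf_v));
  assert(std::isfinite(1.0f));
  return 0;
}
```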
@@ -103,12 +91,12 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
       if (cpu_found_inf_data) {
         inverse_scale = 0.0;
       }
-      auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL");

+      paddle::platform::XPUVersion version = dev_ctx.xpu_version();
+      framework::Tensor float_x;
+      framework::Tensor float_out;
       if (std::is_same<T, paddle::platform::float16>::value &&
-          (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) {
-        framework::Tensor float_x;
-        framework::Tensor float_out;
+          (version == paddle::platform::XPUVersion::XPU1)) {
         float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                       x->numel() * sizeof(MPDType));
         float_out.mutable_data<MPDType>(
@@ -137,10 +125,6 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                                         "XPU API(cast_v2) return wrong "
                                         "value[%d %s]",
                                         r, XPUAPIErrorMsg[r]));
-      if (dev_ctx.x_context()->xpu_stream) {
-        dev_ctx.Wait();
-      }
-
     } else {
       int r = xpu::scale(dev_ctx.x_context(),
                          reinterpret_cast<const XPUTyp*>(x->data<T>()),
@@ -152,6 +136,9 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                          r, XPUAPIErrorMsg[r]));
       }
     }
+    if (dev_ctx.x_context()->xpu_stream) {
+      dev_ctx.Wait();
+    }
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                  found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
                  sizeof(bool));
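
Across its three hunks this file replaces the fragile XPUSIM_DEVICE_MODEL environment probe with dev_ctx.xpu_version(), hoists the scratch tensors out of the branch, and keeps the fp32 round-trip only where needed: an fp16 tensor is cast to float, unscaled, and cast back on XPU1, but scaled natively on XPU2 (KUNLUN2), with a single Wait() before the flag is copied back. A minimal sketch of that gate (XPUVersion here is a stand-in enum):

```cpp
#include <iostream>
#include <string>

enum class XPUVersion { XPU1, XPU2 };  // stand-in for paddle::platform::XPUVersion

// Mirrors the new branch: only fp16-on-XPU1 takes the fp32 round-trip.
std::string pick_unscale_path(bool is_fp16, XPUVersion version) {
  if (is_fp16 && version == XPUVersion::XPU1)
    return "cast fp16->fp32, xpu::scale in fp32, cast back to fp16";
  return "xpu::scale directly on the native dtype";
}

int main() {
  std::cout << pick_unscale_path(true, XPUVersion::XPU1) << '\n';
  std::cout << pick_unscale_path(true, XPUVersion::XPU2) << '\n';  // fp16 stays fp16 on KUNLUN2
  std::cout << pick_unscale_path(false, XPUVersion::XPU2) << '\n';
}
```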
6 changes: 2 additions & 4 deletions paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
@@ -113,10 +113,9 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
     } else {
       cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
     }
-
     int cpu_good_out_data = 0;
     int cpu_bad_out_data = 0;
-    MPDType cpu_updated_loss_scaling_data;
+    MPDType cpu_updated_loss_scaling_data = cpu_pre_loss_scaling_data;

     if (cpu_found_inf_data) {
       cpu_good_out_data = 0;
@@ -140,8 +139,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
         cpu_good_out_data = 0;
       }
     }
-
-    // copy to host
+    // copy to device
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                  bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
                  sizeof(int));
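
The new initializer guards against an uninitialized read: on steps where neither the increment nor the decrement threshold fires, neither branch of the if/else writes cpu_updated_loss_scaling_data, so an indeterminate value was copied back to the device; seeding it with the previous scale makes "no threshold hit" mean "scale unchanged". A compact sketch of the dynamic loss-scaling rule this kernel implements (threshold, ratio, and floor values are illustrative, not taken from this diff):

```cpp
#include <algorithm>
#include <cstdio>

struct ScaleState {
  float scale;
  int good_steps = 0;  // consecutive overflow-free steps
  int bad_steps = 0;   // consecutive steps that saw inf/nan
};

// Dynamic loss scaling: grow after incr_every_n clean steps, shrink after
// decr_every_n overflowing steps; otherwise keep the previous scale.
ScaleState update(ScaleState s, bool found_inf, int incr_every_n = 1000,
                  int decr_every_n = 2, float incr_ratio = 2.0f,
                  float decr_ratio = 0.5f) {
  float updated = s.scale;  // the fix: default to the previous scale
  if (found_inf) {
    s.good_steps = 0;
    if (++s.bad_steps == decr_every_n) {
      updated = std::max(1.0f, s.scale * decr_ratio);  // illustrative floor
      s.bad_steps = 0;
    }
  } else {
    s.bad_steps = 0;
    if (++s.good_steps == incr_every_n) {
      updated = s.scale * incr_ratio;
      s.good_steps = 0;
    }
  }
  s.scale = updated;
  return s;
}

int main() {
  ScaleState s{32768.0f};
  s = update(s, /*found_inf=*/true);   // 1st overflow: counters move, scale kept
  std::printf("after one overflow:  %.0f\n", s.scale);   // 32768
  s = update(s, /*found_inf=*/true);   // 2nd overflow: threshold hit, scale halves
  std::printf("after two overflows: %.0f\n", s.scale);   // 16384
}
```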
7 changes: 5 additions & 2 deletions paddle/fluid/operators/fill_constant_op_xpu.cc
@@ -17,8 +17,11 @@ namespace ops = paddle::operators;
 #ifdef PADDLE_WITH_XPU
 REGISTER_OP_XPU_KERNEL(
     fill_constant, ops::FillConstantKernel<float>,
-    ops::FillConstantKernel<int64_t>, ops::FillConstantKernel<double>,
-    ops::FillConstantKernel<bool>, ops::FillConstantKernel<int>,
+    ops::FillConstantKernel<double>, ops::FillConstantKernel<uint8_t>,
+    ops::FillConstantKernel<int16_t>, ops::FillConstantKernel<int>,
+    ops::FillConstantKernel<int64_t>, ops::FillConstantKernel<bool>,
+    ops::FillConstantKernel<paddle::platform::float16>,
+    ops::FillConstantKernel<paddle::platform::bfloat16>,
     ops::FillConstantKernel<paddle::platform::complex<float>>,
     ops::FillConstantKernel<paddle::platform::complex<double>>);
 #endif
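
fill_constant's kernel list now enumerates the full dtype matrix, adding uint8_t, int16_t, paddle::platform::float16, and bfloat16, so fp16 graphs can materialize constants on the XPU without a host-side cast. A toy illustration of the one-instantiation-per-dtype pattern (Fill and the fp16 structs stand in for ops::FillConstantKernel and Paddle's real half types):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in 16-bit float payloads; real fp16/bf16 types carry conversion logic.
struct float16 { uint16_t bits; };
struct bfloat16 { uint16_t bits; };

// Stand-in for ops::FillConstantKernel<T>: one instantiation per dtype.
template <typename T>
std::vector<T> Fill(size_t n, T value) {
  return std::vector<T>(n, value);
}

int main() {
  // Each supported dtype is a separate template instantiation; a dtype
  // missing from the registration list simply has no kernel to dispatch to.
  auto a = Fill<float>(4, 1.0f);
  auto b = Fill<int64_t>(4, 7);
  auto c = Fill<float16>(4, float16{0x3C00});  // 1.0 in IEEE half
  std::cout << a.size() + b.size() + c.size() << " elements filled\n";
}
```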
(The remaining 10 changed files in this commit are not shown.)
