[Hackathon No.50] Add float16 data type support for the Paddle lerp operator #50925

Merged: 8 commits, Apr 3, 2023
Changes from 5 commits
27 changes: 18 additions & 9 deletions paddle/phi/kernels/gpu/lerp_grad_kernel.cu
@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"

#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
@@ -35,16 +36,18 @@ __global__ void LerpGradKernelImpl(const T* weight,
const int out_size,
const int x_size,
const int y_size) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
CUDA_KERNEL_LOOP_TYPE(idx, out_size, int64_t) {
T temp_dx = weight[idx] * dout[idx];
MPType temp_dx =
static_cast<MPType>(weight[idx]) * static_cast<MPType>(dout[idx]);
if (dx) {
if (idx < x_size) {
dx[idx] = dout[idx] - temp_dx;
dx[idx] = static_cast<T>(static_cast<MPType>(dout[idx]) - temp_dx);
}
}
if (dy) {
if (idx < y_size) {
dy[idx] = temp_dx;
dy[idx] = static_cast<T>(temp_dx);
}
}
}
Expand All @@ -58,17 +61,18 @@ __global__ void LerpGradScalarKernelImpl(const T* weight,
const int out_size,
const int x_size,
const int y_size) {
T weight_scalar = weight[0];
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType weight_scalar = static_cast<MPType>(weight[0]);
CUDA_KERNEL_LOOP_TYPE(idx, out_size, int64_t) {
T temp_dx = weight_scalar * dout[idx];
MPType temp_dx = weight_scalar * static_cast<MPType>(dout[idx]);
if (dx) {
if (idx < x_size) {
dx[idx] = dout[idx] - temp_dx;
dx[idx] = static_cast<T>(static_cast<MPType>(dout[idx]) - temp_dx);
}
}
if (dy) {
if (idx < y_size) {
dy[idx] = temp_dx;
dy[idx] = static_cast<T>(temp_dx);
}
}
}
@@ -270,5 +274,10 @@ void LerpGradKernel(const Context& ctx,

} // namespace phi

PD_REGISTER_KERNEL(
lerp_grad, GPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {}
PD_REGISTER_KERNEL(lerp_grad,
GPU,
ALL_LAYOUT,
phi::LerpGradKernel,
phi::dtype::float16,
float,
double) {}
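A note on the casts above: MPTypeTrait maps float16 to a wider compute type (float), so the fp16 kernels do the arithmetic in single precision and cast back to float16 only when storing the result. A small NumPy sketch of the rounding behaviour this avoids (values are illustrative only, not taken from the PR):

import numpy as np

x, y, w = np.float16(0.1), np.float16(10.0), np.float16(0.3)

# naive: every intermediate value is rounded to fp16
naive = np.float16(x + w * (y - x))

# mixed precision: accumulate in fp32, round once at the end (the MPType pattern)
mixed = np.float16(np.float32(x) + np.float32(w) * (np.float32(y) - np.float32(x)))

print(naive, mixed)  # the two can differ in the last bits for some inputs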
8 changes: 7 additions & 1 deletion paddle/phi/kernels/gpu/lerp_kernel.cu
@@ -18,4 +18,10 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/lerp_kernel_impl.h"

PD_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, phi::LerpKernel, float, double) {}
PD_REGISTER_KERNEL(lerp,
GPU,
ALL_LAYOUT,
phi::LerpKernel,
phi::dtype::float16,
float,
double) {}
17 changes: 13 additions & 4 deletions paddle/phi/kernels/impl/lerp_kernel_impl.h
Expand Up @@ -15,6 +15,7 @@
#pragma once

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"

@@ -43,11 +44,14 @@ static void LerpFunction(const Context& ctx,
auto eigen_w = phi::EigenTensor<T, D>::From(weight, w_dims);
auto eigen_out = phi::EigenTensor<T, D>::From(*out);

using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
auto& place = *ctx.eigen_device();
eigen_out.device(place) =
eigen_x.broadcast(x_bcast_dims) +
eigen_w.broadcast(w_bcast_dims) *
(eigen_y.broadcast(y_bcast_dims) - eigen_x.broadcast(x_bcast_dims));
(eigen_x.broadcast(x_bcast_dims).template cast<MPType>() +
eigen_w.broadcast(w_bcast_dims).template cast<MPType>() *
(eigen_y.broadcast(y_bcast_dims).template cast<MPType>() -
eigen_x.broadcast(x_bcast_dims).template cast<MPType>()))
.template cast<T>();
}

template <typename Context, typename T>
@@ -64,8 +68,13 @@ static void LerpFunctionZero(const Context& ctx,
auto eigen_w = phi::EigenTensor<T, 1>::From(weight, dim);
auto eigen_out = phi::EigenTensor<T, 1>::From(*out, dim);

using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
auto& place = *ctx.eigen_device();
eigen_out.device(place) = eigen_x + eigen_w * (eigen_y - eigen_x);
eigen_out.device(place) =
(eigen_x.template cast<MPType>() +
eigen_w.template cast<MPType>() *
(eigen_y.template cast<MPType>() - eigen_x.template cast<MPType>()))
.template cast<T>();
}

template <typename T, typename Context>
62 changes: 62 additions & 0 deletions python/paddle/fluid/tests/unittests/test_lerp_op.py
@@ -74,6 +74,68 @@ def init_shape(self):
self.shape = [2, 1, 2, 5, 1, 5]


class TestLerpWithDim2Fp16(TestLerp):
def init_shape(self):
self.shape = [2, 50]

def init_dtype(self):
self.dtype = np.float16


class TestLerpWithDim3Fp16(TestLerp):
def init_shape(self):
self.shape = [2, 2, 25]

def init_dtype(self):
self.dtype = np.float16


class TestLerpWithDim4Fp16(TestLerp):
def init_shape(self):
self.shape = [2, 2, 5, 5]

def init_dtype(self):
self.dtype = np.float16


class TestLerpWithDim5Fp16(TestLerp):
def init_shape(self):
self.shape = [2, 1, 2, 5, 5]

def init_dtype(self):
self.dtype = np.float16


class TestLerpWithDim6Fp16(TestLerp):
def init_shape(self):
self.shape = [2, 1, 2, 5, 1, 5]

def init_dtype(self):
self.dtype = np.float16
Contributor:

Apart from tensor rank, these 5 cases above should make no difference to the kernel that gets invoked. The shapes of x and y are identical, and each tensor even has 100 elements, so in terms of test coverage it is enough to keep just the largest of these identical cases. The original fp32 unit tests are already overly verbose; there is no need to mirror the same number of cases.

Contributor Author:

[screenshot attachment]
These five cases take five different branches in the forward kernel (see the screenshot). The fp32 tests appear to be written this way to check correctness at every rank. Does this still need to be changed?

Contributor:

They actually all go through the same LerpFunction. The reason there are so many branches is the Eigen implementation: the rank (dim) has to be supplied as a template parameter. The same function is used in every case, and the computation itself is identical.

Contributor Author:

done



class TestLerpWihFp16BroadXY(TestLerp):
def setUp(self):
self.op_type = "lerp"
self.python_api = paddle.lerp
x = np.arange(1.0, 201.0).astype(np.float16).reshape([2, 1, 2, 50])
y = np.full(200, 10.0).astype(np.float16).reshape([2, 2, 1, 50])
w = np.asarray([0.5]).astype(np.float16)
self.inputs = {'X': x, 'Y': y, 'Weight': w}
self.outputs = {'Out': x + w * (y - x)}


class TestLerpWithFp16BroadWToXY(TestLerp):
def setUp(self):
self.op_type = "lerp"
self.python_api = paddle.lerp
x = np.full(600, 2.5).astype(np.float16).reshape([50, 2, 2, 3])
y = np.full(600, 1.0).astype(np.float16).reshape([50, 2, 2, 3])
w = np.random.random([3]).astype(np.float16)
self.inputs = {'X': x, 'Y': y, 'Weight': w}
self.outputs = {'Out': x + w * (y - x)}
Contributor:

  • Of these 3 unit tests, keeping 1 is enough, because the kernel implementation does not treat the different shapes separately.
  • The code can also be simplified: since they inherit from TestLerp, the fp16 cases only need to implement init_dtype and init_shape.

Contributor Author:

Hello,

  • Of these three unit tests I kept two; those two hit different implementations in the backward kernel.
  • The test code has been simplified as requested.

Contributor:

The 5 cases above and TestLerpWithFp16BroadWToXY don't seem to differ in anything other than shape? x and y have the same shape and w.shape=[1], so the test content is somewhat repetitive. Of the 5 cases above plus TestLerpWithFp16BroadWToXY, keeping just one would be enough.

Also, couldn't TestLerpWihFp16BroadXY and TestLerpWithFp16BroadWToXY be simplified by directly overriding init_shape and init_dtype as well?

Contributor Author:

The five cases above are actually different from TestLerpWithFp16BroadWToXY. In TestLerpWithFp16BroadWToXY, w.shape=[3], so the backward pass takes the broadcast-w branch, whereas in the five cases above w.shape=[1], so they take the scalar-w branch.
Also, rewriting TestLerpWihFp16BroadXY and TestLerpWithFp16BroadWToXY via init_shape and init_dtype is awkward: in TestLerpWihFp16BroadXY the shapes of X and Y are not the same, and in TestLerpWithFp16BroadWToXY w is a random vector of shape=[3] rather than the base class's scalar with w.shape=[1]. That is why I chose to override setUp directly; would that be acceptable?

Contributor:

Understood, then TestLerpWihFp16BroadXY and TestLerpWithFp16BroadWToXY can stay.
They could still inherit from the existing TestLerpOp, though: give init_shape the ability to set w's shape as well (a small change to TestLerpOp). That would be more general and would also simplify the code.

Contributor:

Perhaps only 3 cases need to be kept: TestLerpWihFp16BroadXY, TestLerpWithFp16BroadWToXY, and one where x and y have the same shape and w.shape=[1]?

Contributor Author:

OK, changed as described. I added init_wshape and init_xyshape to the base class; init_xyshape handles initialization when the shapes of x and y are not equal. The original fp32 TestLerpBroadXY and TestLerpBroadWToXY were simplified accordingly.
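For reference, a minimal sketch of what one of the simplified fp16 cases could look like, assuming the TestLerp base class calls init_dtype, init_shape, and the new init_wshape hook from its setUp (the hook contents and the self.wshape attribute name are placeholders, not the final PR code):

import numpy as np

# Sketch only: relies on TestLerp invoking these init_* hooks during setUp().
class TestLerpFp16BroadWToXYSketch(TestLerp):
    def init_dtype(self):
        # run the op in half precision
        self.dtype = np.float16

    def init_shape(self):
        # x and y share this shape
        self.shape = [50, 2, 2, 3]

    def init_wshape(self):
        # a non-scalar weight exercises the broadcast-w backward branch
        self.wshape = [3]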



class TestLerpBroadXY(TestLerp):
def setUp(self):
self.op_type = "lerp"
16 changes: 10 additions & 6 deletions python/paddle/tensor/math.py
@@ -4212,9 +4212,9 @@ def lerp(x, y, weight, name=None):
lerp(x, y, weight) = x + weight * (y - x).

Args:
x (Tensor): An N-D Tensor with starting points, the data type is float32, float64.
y (Tensor): An N-D Tensor with ending points, the data type is float32, float64.
weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64.
x (Tensor): An N-D Tensor with starting points, the data type is float16, float32, float64.
y (Tensor): An N-D Tensor with ending points, the data type is float16, float32, float64.
weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float16, float32, float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

Returns:
@@ -4238,10 +4238,14 @@ if in_dygraph_mode():
if in_dygraph_mode():
return _C_ops.lerp(x, y, weight)
else:
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'lerp')
check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'lerp')
check_variable_and_dtype(
weight, 'weight', ['float32', 'float64'], 'lerp'
x, 'x', ['float16', 'float32', 'float64'], 'lerp'
)
check_variable_and_dtype(
y, 'y', ['float16', 'float32', 'float64'], 'lerp'
)
check_variable_and_dtype(
weight, 'weight', ['float16', 'float32', 'float64'], 'lerp'
)

helper = LayerHelper('lerp', **locals())
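As a quick end-to-end illustration of the new dtype support (a usage sketch, not part of this PR; it assumes a CUDA build of Paddle, since the float16 kernels above are registered for the GPU backend only):

import paddle

paddle.set_device('gpu')  # the float16 lerp kernels are registered for GPU only

x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0], dtype='float16')
y = paddle.full([4], 10.0, dtype='float16')

out = paddle.lerp(x, y, 0.5)  # x + 0.5 * (y - x)
print(out)  # expected values: [5.5, 6.0, 6.5, 7.0]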