improve the performance of divide_double_grad #62533
Conversation
Your PR has been submitted successfully. Thank you for contributing to this open source project!
dy->Resize(y.dims());
dev_ctx.template Alloc<T>(dy);
if (ddx_tensor == nullptr && ddy_tensor == nullptr) {
  dy = nullptr;
- If a None on the right-hand side of the derived formula makes the left-hand variable impossible to compute, then following Paddle's current convention the output should be filled with zeros: FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy); otherwise a large amount of code would need to change.
- Setting the pointer itself to nullptr should have no effect.

The same issue applies in the other places below.
Done
@@ -312,17 +312,6 @@ void DivideDoubleGradKernel(const Context& dev_ctx,
if (ddy_tensor == nullptr) {
  dout = nullptr;
- Assigning nullptr to a pointer that was passed by value should have no effect, so the if-else branching here can be simplified accordingly.
- dy should be assigned an all-zero matrix with the same shape as y.
Done
Please compare the performance before and after the number of kernel calls was reduced.
DenseTensor dz_div_y;
dz_div_y.Resize(out.dims());
if (!dx_tensor || dx_tensor->dims() != out.dims()) {
  dev_ctx.template Alloc<T>(&dz_div_y);
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::DivideFunctor<T>,
                                    funcs::InverseDivideFunctor<T>>(
      dev_ctx, grad_out, y, &dz_div_y, axis);
  dx_tensor = &dz_div_y;
}
Since dx (dz_div_y) is only used when computing dy and dout, I think you can:
- Change the if condition at line 182 to: if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims()))
- Keep the definition of dz_div_y where it is, but move dz_div_y.Resize(out.dims()); inside the if block, because the intermediate dz_div_y is only needed when dx_tensor has to be computed (see the sketch below).

This keeps unnecessary computation to a minimum.
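A minimal sketch of the restructuring suggested above, reusing the names from the quoted snippet (illustrative only, not the final code):

// Allocate and compute dz_div_y only when a downstream output (dy or dout)
// actually needs it and dx cannot be used as-is.
DenseTensor dz_div_y;
if ((dy || dout) && (!dx_tensor || dx_tensor->dims() != out.dims())) {
  dz_div_y.Resize(out.dims());
  dev_ctx.template Alloc<T>(&dz_div_y);
  // dz_div_y = dOut / Y, the common term shared by the dy and dout formulas.
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::DivideFunctor<T>,
                                    funcs::InverseDivideFunctor<T>>(
      dev_ctx, grad_out, y, &dz_div_y, axis);
  dx_tensor = &dz_div_y;
}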
Done
dy->Resize(y.dims());
dev_ctx.template Alloc<T>(dy);
if (!ddx_tensor && !ddy_tensor) {
  FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
Could the third constructor argument Scalar(0.0) be changed to Scalar(static_cast<T>(0.0))? Otherwise there could be problems when T is a complex type.
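For illustration, the suggested call would then read as follows (a sketch only; the surrounding arguments are taken unchanged from the snippet above):

// Build the zero Scalar from T so that complex element types get a
// well-defined zero instead of a plain double literal.
FullLikeKernel<T, Context>(
    dev_ctx, y, Scalar(static_cast<T>(0.0)), y.dtype(), dy);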
Done
if (!ddx_tensor && !ddy_tensor) {
  FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
} else {
  DenseTensor tmp_dy = tmp;
Line 209 can probably be deleted; tmp_dy is semantically unclear anyway, so it would be better to use tmp directly and replace every tmp_dy below with tmp.
Done
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::DivideFunctor<T>,
                                  funcs::InverseDivideFunctor<T>>(
    dev_ctx, *dx_tensor, y, &tmp_dy, axis);
if (ddx_tensor && !ddy_tensor) {
  // dy = -dX * ddX / Y
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::MultiplyFunctor<T>,
                                    funcs::InverseMultiplyFunctor<T>>(
      dev_ctx, *ddx_tensor, tmp_dy, dy, axis);
  auto& place = *dev_ctx.eigen_device();
  auto dy_result = phi::EigenVector<T>::Flatten(*dy);
  dy_result.device(place) = static_cast<T>(-1) * dy_result;
} else if (!ddx_tensor && ddy_tensor) {
  // dY = Out * dX * ddY / Y
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::MultiplyFunctor<T>,
                                    funcs::InverseMultiplyFunctor<T>>(
      dev_ctx, *ddy_tensor, tmp_dy, &tmp_dy, axis);
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::MultiplyFunctor<T>,
                                    funcs::InverseMultiplyFunctor<T>>(
      dev_ctx, out, tmp_dy, dy, axis);
} else {
  // dY = Out * dX * ddY / Y - dX * ddX / Y
  phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
      dev_ctx,
      ddX_safe,
      ddY_safe,
      out,
      dX_div_Y,
      axis,
      nullptr,
      dy,
      DivGradDX<T>(),
      DivDoubleDY<T>());
  // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the
  // first output tensor is nullptr, the branch to calculate first
  // output tensor will not be activated, DivGradDx function will not
  // be called and can be ignored, the first branch has little effect
  // on running speed.
  phi::funcs::
      ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY<T>>(
          dev_ctx,
          *ddx_tensor,
          *ddy_tensor,
          out,
          tmp_dy,
          axis,
          nullptr,
          dy,
          DivGradDX<T>(),
          DivDoubleDY<T>());
  }
}
The logic here first computes the common term dx / y up front and then takes different branches depending on the conditions, and some branches call DefaultElementwiseOperator more than once. Please check whether all of this can be unified on ElemwiseGradCompute so that each if-else branch finishes with a single call. That requires writing a different dy_op for each condition (e.g. DivDoubleDY_Only_DDX, DivDoubleDY_Only_DDY) and passing it to ElemwiseGradCompute. Each of these dy_op functors actually reads a different subset of the parameters, and for the unused ones you can simply pass any same-shaped DenseTensor that can be safely accessed, purely as a placeholder (see the sketch below).
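A minimal sketch of what such a single-call branch could look like for the ddX-only case, assuming the (x, y, out, dout, axis, dx, dy, dx_op, dy_op) argument order of ElemwiseGradCompute shown earlier in this diff; DivDoubleDY_Only_DDX and the placeholder choice are illustrative:

// ddX-only branch: dY = -ddX * (dX / Y). DivDoubleDY_Only_DDX reads only the
// x and dout slots, so tmp (which holds dX / Y) also serves as a same-shaped,
// readable placeholder for the unused y slot.
phi::funcs::
    ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDY_Only_DDX<T>>(
        dev_ctx,
        *ddx_tensor,  // x slot: ddX
        tmp,          // y slot: placeholder, never read by the dy_op
        out,          // out slot: unused by this dy_op
        tmp,          // dout slot: the pre-computed dX / Y
        axis,
        nullptr,      // dx output not requested, so DivGradDX is skipped
        dy,
        DivGradDX<T>(),
        DivDoubleDY_Only_DDX<T>());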
Done
} else if (ddx_tensor != nullptr && ddy_tensor == nullptr) {
  // ddOut = ddX / Y
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::DivideFunctor<T>,
                                    funcs::InverseDivideFunctor<T>>(
      dev_ctx, *ddx_tensor, y, ddout, axis);
} else if (!ddx_tensor && ddy_tensor) {
  // ddOut = - Out * ddY / Y
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::MultiplyFunctor<T>,
                                    funcs::InverseMultiplyFunctor<T>>(
      dev_ctx, out, *ddy_tensor, &tmp, axis);
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::DivideFunctor<T>,
                                    funcs::InverseDivideFunctor<T>>(
      dev_ctx, tmp, y, ddout, axis);
  auto& place = *dev_ctx.eigen_device();
  auto ddout_result = phi::EigenVector<T>::Flatten(*ddout);
  ddout_result.device(place) = static_cast<T>(-1) * ddout_result;
} else {
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::MultiplyFunctor<T>,
                                    funcs::InverseMultiplyFunctor<T>>(
      dev_ctx, out, *ddy_tensor, &tmp, axis);
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::SubtractFunctor<T>,
                                    funcs::InverseSubtractFunctor<T>>(
      dev_ctx, *ddx_tensor, tmp, &tmp, axis);
  funcs::DefaultElementwiseOperator<Context,
                                    T,
                                    funcs::DivideFunctor<T>,
                                    funcs::InverseDivideFunctor<T>>(
      dev_ctx, tmp, y, ddout, axis);
}
Likewise, can the multiple DefaultElementwiseOperator calls here be merged into a single call?
  FullLikeKernel<T, Context>(dev_ctx, y, Scalar(0.0), y.dtype(), dy);
} else {
  DenseTensor tmp_dy = tmp;
  // dX / Y
// dX / Y ==> // pre-compute 'dX / Y' into 'tmp' for 'ddout' and/or 'dy'
Done
auto& place = *dev_ctx.eigen_device();
auto dout_result = phi::EigenVector<T>::Flatten(*dout);
dout_result.device(place) = static_cast<T>(-1) * dout_result;
}
}
}
Add a blank line between lines 326 and 327.
@@ -166,33 +166,28 @@ template <typename T, typename Context>
void DivideDoubleGradKernel(const Context& dev_ctx,
Inside the DivDoubleDY functor, dout can be factored out to reduce the number of multiplications (see the sketch below).
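A sketch of that factoring, assuming DivDoubleDY currently computes y * out * dout - x * dout (which matches the dY formula quoted in this diff):

template <typename T>
struct DivDoubleDY {
  // dY = (ddY * Out - ddX) * (dX / Y): factoring out dout saves one
  // multiplication per element compared with y * out * dout - x * dout.
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return (y * out - x) * dout;
  }
};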
template <typename T>
struct DivDoubleDY_Only_DDX {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -x * dout; }
};

template <typename T>
struct DivDoubleDY_Only_DDY {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return y * out * dout;
  }
};
The parameter types can be changed to const T&.
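For example, a sketch of one of the functors with the suggested signature:

template <typename T>
struct DivDoubleDY_Only_DDX {
  // Passing by const reference avoids copying T when it is a wider type such
  // as a complex number.
  HOSTDEVICE T operator()(const T& x, const T& y, const T& out,
                          const T& dout) const {
    return -x * dout;
  }
};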
The code-coverage check failed; some unit tests may need to be added to cover the lines marked in red: https://xly.bce.baidu.com/paddlepaddle/paddle/newipipe/detail/10325058/job/25667520
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::MultiplyFunctor<T>,
                                  funcs::InverseMultiplyFunctor<T>>(
    dev_ctx, out, *ddy_tensor, &tmp, axis);
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::SubtractFunctor<T>,
                                  funcs::InverseSubtractFunctor<T>>(
    dev_ctx, *ddx_tensor, tmp, &tmp, axis);
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::DivideFunctor<T>,
                                  funcs::InverseDivideFunctor<T>>(
    dev_ctx, tmp, y, ddout, axis);
These three calls can be merged into a single one (a sketch follows below).
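One possible single-call form, sketched with the DivDoubleDDOut functor that appears (commented out) later in this diff and the ElemwiseGradCompute argument order seen above; the exact wiring is an assumption, not the final code:

// ddOut = (ddX - Out * ddY) / Y in one elementwise pass:
// DivDoubleDDOut(x, y, out, dout) = (x - out * y) / dout, with Y routed
// through the dout slot.
phi::funcs::ElemwiseGradCompute<Context, T, DivGradDX<T>, DivDoubleDDOut<T>>(
    dev_ctx,
    *ddx_tensor,  // x slot: ddX
    *ddy_tensor,  // y slot: ddY
    out,          // out slot: Out
    y,            // dout slot: Y, the divisor
    axis,
    nullptr,      // dx output not requested, so DivGradDX is skipped
    ddout,
    DivGradDX<T>(),
    DivDoubleDDOut<T>());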
Done
// ddOut = - Out * ddY / Y
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::MultiplyFunctor<T>,
                                  funcs::InverseMultiplyFunctor<T>>(
    dev_ctx, out, *ddy_tensor, &tmp, axis);
// VLOG(4) << "5";
funcs::DefaultElementwiseOperator<Context,
                                  T,
                                  funcs::DivideFunctor<T>,
                                  funcs::InverseDivideFunctor<T>>(
    dev_ctx, tmp, y, ddout, axis);
auto& place = *dev_ctx.eigen_device();
auto ddout_result = phi::EigenVector<T>::Flatten(*ddout);
ddout_result.device(place) = static_cast<T>(-1) * ddout_result;
Likewise, the two calls here can be merged into one.
Done
// template <typename T>
// struct DivDoubleDDOut {
//   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
//     return (x - out * y) / dout;
//   }
// };

// template <typename T>
// struct DivDoubleDDOut_Only_DDX {
//   HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return x / dout; }
// };

// template <typename T>
// struct DivDoubleDDOut_Only_DDY {
//   HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
//     return -out * y / dout;
//   }
// };
Change the parameter types to const T& here as well.
Done
                                  funcs::MultiplyFunctor<T>,
                                  funcs::InverseMultiplyFunctor<T>>(
    dev_ctx, out, *ddy_tensor, &tmp, axis);
// VLOG(4) << "5";
The VLOG used for debugging can be deleted.
Done
LGTM for op benchmark ci
PR Category
Performance Optimization
PR Types
Performance
Description
Optimize the big-operator implementation of divide_double_grad.
Before optimization:
After optimization: