diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
new file mode 100644
index 000000000..f2a816d4d
--- /dev/null
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -0,0 +1,293 @@
# paddle.copysign Design Document

| API name              | paddle.copysign                  |
| --------------------- | -------------------------------- |
| Author                | coco                             |
| Submission date       | 2023-09-14                       |
| Version               | V1.0                             |
| Target Paddle version | develop                          |
| File name             | 20230914_api_design_for_copysign |

# I. Overview

## 1. Background

To enrich Paddle's API surface, the following call paths are to be added:

- paddle.copysign, a standalone function, not inplace;
- paddle.copysign_, a standalone function that modifies its input inplace;
- Tensor.copysign, a Tensor method, not inplace;
- Tensor.copysign_, a Tensor method that modifies its input inplace.

## 2. Functional goals

Compute the output tensor element-wise from two inputs: each output element combines the magnitude (absolute value) of the first input with the sign of the second input.
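For reference, `numpy.copysign` (which the proposed API is intended to match) already implements exactly these semantics; the snippet below only illustrates the expected behavior:

```python
import numpy as np

x = np.array([-1.5, 0.0, 2.0, 3.0])
y = np.array([1.0, -1.0, -0.0, -2.0])

# Each output element takes its magnitude from x and its sign from y;
# note that the sign of -0.0 in y is treated as negative.
print(np.copysign(x, y))  # [ 1.5 -0.  -2.  -3. ]
```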
## 3. Significance

Paddle will support copying signs in a batched, element-wise fashion directly on tensors.

# II. Current status in Paddle

Paddle currently has no implementation of this functionality.

# III. Industry research

## PyTorch

PyTorch provides `torch.copysign(input, other, *, out=None) → Tensor` and the corresponding `torch.Tensor.copysign`.

The PyTorch documentation describes it as:

```
Create a new floating-point tensor with the magnitude of input and the sign of other, elementwise.

Supports broadcasting to a common shape, and integer and float inputs.
```

### Implementation

PyTorch implements the operator in C++; see the [CPU kernel](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L1148-L1158):

```cpp
void copysign_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
    cpu_kernel_vec(iter,
      [](scalar_t a, scalar_t b) -> scalar_t {
        return c10::copysign(a, b);
      },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
        return a.copysign(b);
      });
  });
}
```

In the c10 namespace ([code location](https://github.com/pytorch/pytorch/blob/main/c10/util/copysign.h#L12-L15)):

```cpp
namespace c10 {

// Note: Explicit implementation of copysign for Half and BFloat16
// is needed to workaround g++-7/8 crash on aarch64, but also makes
// copysign faster for the half-precision types
template <typename T, typename U>
inline auto copysign(const T& a, const U& b) {
  return std::copysign(a, b);
}
...
} // namespace c10
```

The [CUDA kernel](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/CopysignKernel.cu#L23-L29):

```cpp
namespace at::native {

void copysign_kernel_cuda(TensorIteratorBase& iter) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cuda", [&]() {
    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      return c10::cuda::compat::copysign(a, b);
    });
  });
}

REGISTER_DISPATCH(copysign_stub, &copysign_kernel_cuda);

} // namespace at::native
```

The `copysign` called in that namespace ([code location](https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDAMathCompat.h#L46-L65)):

```cpp
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  return ::copysignf(x, y);
#else
  // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
  // (e.g. Jetson), see PyTorch PR #51834
  // This host function needs to be here for the compiler but is never used
  TORCH_INTERNAL_ASSERT(
      false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
  return ::copysign(x, y);
#else
  // see above
  TORCH_INTERNAL_ASSERT(
      false, "CUDAMathCompat copysign should not run on the CPU");
#endif
}
```

In every case the implementation ultimately calls the C++ `copysign` function.

**Backward:**

Operator configuration ([code location](https://github.com/pytorch/pytorch/blob/main/tools/autograd/derivatives.yaml#L474-L481C28)):

```yaml
- name: copysign.Tensor(Tensor self, Tensor other) -> Tensor
  self: copysign_tensor_self_backward(grad, self, result)
  other: zeros_like(other)
  result: copysign_tensor_self_backward(self_t, self_p, result)

- name: copysign.Scalar(Tensor self, Scalar other) -> Tensor
  self: copysign_tensor_self_backward(grad, self, result)
  result: auto_element_wise
```

Backward implementation ([code location](https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/FunctionsManual.cpp#L94-L101)):

```cpp
Tensor copysign_tensor_self_backward(
    const Tensor& grad,
    const Tensor& self,
    const Tensor& result) {
  auto ratio = result / self;
  ratio.masked_fill_(self == 0, 0);
  return grad * ratio;
}
```

## TensorFlow

No `copysign` implementation.

## Numpy

`numpy.copysign(x1, x2, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj]) = <ufunc 'copysign'>`

Change the sign of x1 to that of x2, element-wise. If *x2* is a scalar, its sign will be copied to all elements of *x1*.

### Implementation

The loop functions are generated from a template and call the underlying C implementation ([code location](https://github.com/numpy/numpy/blob/main/numpy/core/src/umath/loops.c.src#L1213-L1221)):

```
NPY_NO_EXPORT void
@TYPE@_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        *((@type@ *)op1) = npy_copysign@c@(in1, in2);
    }
}
```

which in turn calls the C math library ([code location](https://github.com/numpy/numpy/blob/main/numpy/core/include/numpy/npy_math.h#L199)):

```cpp
#include <math.h>

...
#define npy_copysign copysign
...
```

# IV. Comparative analysis

PyTorch and NumPy are implemented in essentially the same way: both call `copysign` from the C/C++ math library under the hood. PyTorch additionally supports backward.

# V. Design and implementation

## Naming and parameters

The API is designed as:

- paddle.copysign(x, y, name=None), a standalone function, not inplace;
- paddle.copysign_(x, y, name=None), a standalone function that modifies its input inplace;
- Tensor.copysign(y, name=None), a Tensor method, not inplace;
- Tensor.copysign_(y, name=None), a Tensor method that modifies its input inplace.

where

+ x (Tensor) - the Tensor whose absolute values provide the magnitude of the output; supports `bool`, `float16`, `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`, `bfloat16`
+ y (Tensor | Number) - when a Tensor, its shape must equal x's shape or be broadcastable to x.shape, supporting `bool`, `float16`, `float32`, `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`, `bfloat16`; when a Number, supports `bool`, `int`, `float`

## Kernel (OP) design

Following the PyTorch and NumPy designs, the OP is implemented in C++. A rough sketch of the backward kernel:

```cpp
template <typename T>
struct CopySignGradFunctor {
  CopySignGradFunctor(const T* x_data, const T* y_data, const T* dout, T* dx, int64_t numel)
      : x_data_(x_data), y_data_(y_data), dout_(dout), dx_(dx), numel_(numel) {}

  // backward logic: dx = dout * copysign(x, y) / x, and 0 where x == 0
  HOSTDEVICE void operator()(int64_t idx) const {
    if (x_data_[idx] == T(0)) dx_[idx] = T(0);
    else dx_[idx] = T(dout_[idx]) * (T(std::copysign(x_data_[idx], y_data_[idx]) / x_data_[idx]));
  }

  const T* x_data_;
  const T* y_data_;
  const T* dout_;
  T* dx_;
  int64_t numel_;
};

template <typename T, typename Context>
void CopySignGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& y,
                        const DenseTensor& out_grad,
                        DenseTensor* x_grad) {
  dev_ctx.template Alloc<T>(x_grad);
  auto x_data = x.data<T>(), y_data = y.data<T>(), out_grad_data = out_grad.data<T>();
  auto x_grad_data = x_grad->data<T>();
  phi::funcs::ForRange<Context> for_range(dev_ctx, x.numel());
  phi::CopySignGradFunctor<T> functor(x_data, y_data, out_grad_data, x_grad_data, x.numel());
  for_range(functor);
}
```
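As a cross-check on the gradient rule above (and as a candidate reference for the unit tests in section VI), here is a minimal NumPy sketch of the same backward computation; the helper name `ref_copysign_grad` is made up for illustration:

```python
import numpy as np

def ref_copysign_grad(x, y, dout):
    """Reference gradient of copysign(x, y) w.r.t. x.

    For x != 0, d/dx copysign(x, y) = copysign(x, y) / x (i.e. sign(x) * sign(y));
    at x == 0 the gradient is taken to be 0, matching the kernel sketch above.
    The gradient w.r.t. y is identically zero (as in PyTorch's derivatives.yaml).
    """
    safe_x = np.where(x == 0, 1.0, x)    # avoid division by zero
    ratio = np.copysign(x, y) / safe_x   # sign(x) * sign(y) for x != 0
    ratio = np.where(x == 0, 0.0, ratio)
    return dout * ratio
```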
## API implementation plan

1. Add the operator's YAML configuration, including the inplace entries.
2. Implement `CopySignInferMeta` to compute the `shape` and `dtype` of `out` before the kernel is invoked.
3. Implement the CPU and GPU code of `CopySignKernel`, covering both forward and backward.
4. Wrap the Python API with support for dynamic and static graph modes, and write the documentation.
5. Write the unit tests.

# VI. Testing and acceptance criteria

The test cases to cover:

+ **Programming paradigms**: cover both dynamic-graph and static-graph modes.
+ **Hardware**: cover both CPU and GPU.
+ **Parameter combinations**: cover all input parameters, with validity and boundary-value tests, and cover the optional parameters as well.
+ **Numerical accuracy**: verify the correctness of both forward and backward computation.
  + Forward: compare against a NumPy reference implementation.
  + Backward: derive the gradient with NumPy and check the backward results against it.
+ **Dimensionality**: Paddle APIs support 0-d tensors, so the unit tests must include 0-d cases.
+ **Edge cases**: when y is 0, +0 or -0, verify consistency with NumPy.

# VII. Feasibility and schedule

With existing industry implementations as references, development can be completed within the current release cycle.

# VIII. Impact

This is a standalone new API and has no impact on other modules.

# Glossary

None.

# Appendix and references

[PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.copysign.html?highlight=copysign#torch.copysign)

[NumPy documentation](https://numpy.org/doc/stable/reference/generated/numpy.copysign.html#numpy-copysign)
\ No newline at end of file