From 7d2e856ec0d1af7fb5b0cbe561574cab7ebb6db9 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 14 Sep 2023 02:52:59 +0800
Subject: [PATCH 01/13] add copysign rfc

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 257 ++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 rfcs/APIs/20230914_api_design_for_copysign.md
diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
new file mode 100644
index 000000000..d320f5efc
--- /dev/null
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -0,0 +1,257 @@
+# paddle.copysign 设计文档
+
+| API 名称     | paddle.copysign                  |
+| ------------ | -------------------------------- |
+| 提交作者     | coco                             |
+| 提交时间     | 2023-09-14                       |
+| 版本号       | V1.0                             |
+| 依赖飞桨版本 | develop                          |
+| 文件名       | 20230914_api_design_for_copysign |
+
+# 一、概述
+
+## 1、相关背景
+
+为了提升飞桨API丰富度，Paddle需要扩充API，调用路径为：
+
+- paddle.copysign 作为独立的函数调用，非 inplace
+- paddle.copysign_，作为独立的函数，inplace 地修改输入；
+- Tensor.copysign 作为 Tensor 的方法使用，非 inplace;
+- Tensor.copysign_ 作为 Tensor 的方法使用，inplace 修改输入；
+
+## 2、功能目标
+
+根据两个输入逐元素地计算结果张量，其结果由第一个输入的绝对值大小及第二个输入的符号组成。
+
+## 3、意义
+
+飞桨支持直接通过张量进行批量正负符号复制
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API `torch.copysign(input, other, *, out=None) → [Tensor]` 以及对应的`torch.Tensor.copysign`
+
+在PyTorch中介绍为：
+
+```
+Create a new floating-point tensor with the magnitude of input and the sign of other, elementwise.
+ 
+Supports broadcasting to a common shape, and integer and float inputs.
+```
+
+## 实现方法
+
+从实现方法上，PyTorch是通过c++实现的，[CPU kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L1148-L1158)
+
+```cpp
+void copysign_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cpu", [&]() {
+    cpu_kernel_vec(iter,
+      [](scalar_t a, scalar_t b) -> scalar_t {
+        return c10::copysign(a, b);
+      },
+      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
+        return a.copysign(b);
+      });
+  });
+}
+```
+
+在c10 namespace中，[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/util/copysign.h#L12-L15)：
+
+```cpp
+namespace c10 {
+
+// Note: Explicit implementation of copysign for Half and BFloat16
+// is needed to workaround g++-7/8 crash on aarch64, but also makes
+// copysign faster for the half-precision types
+template <typename T, typename U>
+inline auto copysign(const T& a, const U& b) {
+  return std::copysign(a, b);
+}
+...
+} // namespace c10
+```
+
+[cuda kernel代码位置](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/CopysignKernel.cu#L23-L29)
+
+```cpp
+namespace at::native {
+
+void copysign_kernel_cuda(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "copysign_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
+      return c10::cuda::compat::copysign(a, b);
+    });
+  });
+}
+
+REGISTER_DISPATCH(copysign_stub, &copysign_kernel_cuda);
+
+} // namespace at::native
+```
+
+namespace中的`copysign`调用，[代码位置](https://github.com/pytorch/pytorch/blob/main/c10/cuda/CUDAMathCompat.h#L46-L65)
+
+```cpp
+__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysignf(x, y);
+#else
+  // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64
+  // (e.g. Jetson), see PyTorch PR #51834
+  // This host function needs to be here for the compiler but is never used
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
+  return ::copysign(x, y);
+#else
+  // see above
+  TORCH_INTERNAL_ASSERT(
+      false, "CUDAMathCompat copysign should not run on the CPU");
+#endif
+}
+```
+
+方法都是底层cpp调用copysign函数
+
+
+
+**反向backward:**
+
+算子配置[代码位置](https://github.com/pytorch/pytorch/blob/main/tools/autograd/derivatives.yaml#L474-L481C28)
+
+```yaml
+- name: copysign.Tensor(Tensor self, Tensor other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  other: zeros_like(other)
+  result: copysign_tensor_self_backward(self_t, self_p, result)
+
+- name: copysign.Scalar(Tensor self, Scalar other) -> Tensor
+  self: copysign_tensor_self_backward(grad, self, result)
+  result: auto_element_wise
+```
+
+backward 反向[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/FunctionsManual.cpp#L94-L101)
+
+```cpp
+Tensor copysign_tensor_self_backward(
+    const Tensor& grad,
+    const Tensor& self,
+    const Tensor& result) {
+  auto ratio = result / self;
+  ratio.masked_fill_(self == 0, 0);
+  return grad * ratio;
+}
+```
+
+## TensorFlow
+
+无`copysign`实现
+
+## Numpy
+
+numpy.**copysign**(*x1*, *x2*, */*, *out=None*, ***, *where=True*, *casting='same_kind'*, *order='K'*, *dtype=None*, *subok=True*[, *signature*, *extobj*]) *= <ufunc 'copysign'>*
+
+Change the sign of x1 to that of x2, element-wise. If *x2* is a scalar, its sign will be copied to all elements of *x1*.
+
+### 实现方法
+
+先模板生成函数，底层cpp调用实现[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/src/umath/loops.c.src#L1213-L1221)
+
+```
+NPY_NO_EXPORT void
+@TYPE@_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *((@type@ *)op1)= npy_copysign@c@(in1, in2);
+    }
+}
+```
+
+实际调用cpp的math库[代码位置](https://github.com/numpy/numpy/blob/main/numpy/core/include/numpy/npy_math.h#L199)
+
+```cpp
+#include <math.h>
+
+...
+#define npy_copysign copysign
+...
+```
+
+
+
+# 四、对比分析
+
+PyTorch和Numpy实现方式基本一致，都是底层调用cpp的math库实现`copysign`，PyTorch可进行backward。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为:
+
+- paddle.copysign(x, y) 作为独立的函数调用，非 inplace
+- paddle.copysign_(x, y)，作为独立的函数，inplace 地修改输入；
+- Tensor.copysign(y)做为 Tensor 的方法使用，非 inplace;
+- Tensor.copysign_(y)做为 Tensor 的方法使用， inplace 修改输入；
+
+其中
+
++ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
++ y(Tensor, int, float 等 number)
+
+## 底层OP设计
+
+参考PyTorch与Numpy中的设计，调用底层cpp实现OP
+
+## API实现方案
+
+1. 配置算子的yaml，注意配置inplace
+2. 实现`CopySignInferMeta`，在调用kernel之前计算好`out`的`shape`和`dtype`
+3. 实现`CopySignKernel`的CPU和GPU代码以及forward、backward
+4. 封装Python的API，支持动态图和静态图，编写文档
+5. 编写单测
+
+# 六、测试和验收的考量
+
+测试考虑的case如下：
+
++ **编程范式场景**：常规覆盖动态图和静态图的测试场景
+
++ **硬件场景**：常规需覆盖 CPU、GPU 两种测试场景
++ **参数组合场景**：常规覆盖 API 的全部入参，需要对全部入参进行参数有效性和边界值测试，同时可选参数也需有相应的测试覆盖
++ **计算精度**：需要保证前向计算、反向计算的精度正确性
+  + 前向计算：通过 numpy 实现的函数的对比结果
+  + 反向计算：通过 numpy 推导，计算反向结果的正确性
++ **维度测试**：Paddle API 支持的最低维度为 0 维，单测中应编写相应的 0 维尺寸测试 case
++ **边界测试**：y为0、+0、-0时，测试与numpy结果的一致性
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考，工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API，对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.copysign.html?highlight=copysign#torch.copysign)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.copysign.html#numpy-copysign)
\ No newline at end of file

From 4c0022db6fef9cd3c0029f630a265bc9ac64caed Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 12:02:14 +0800
Subject: [PATCH 02/13] fix input args, add backward kernel, fix python api

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 50 ++++++++++++++++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
index d320f5efc..68a47cb3e 100644
--- a/rfcs/APIs/20230914_api_design_for_copysign.md
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -202,19 +202,55 @@ PyTorch和Numpy实现方式基本一致，都是底层调用cpp的math库实现`
 
 API的设计为:
 
-- paddle.copysign(x, y) 作为独立的函数调用，非 inplace
-- paddle.copysign_(x, y)，作为独立的函数，inplace 地修改输入；
-- Tensor.copysign(y)做为 Tensor 的方法使用，非 inplace;
-- Tensor.copysign_(y)做为 Tensor 的方法使用， inplace 修改输入；
+- paddle.copysign(x, y, name=None) 作为独立的函数调用，非 inplace;
+- paddle.copysign_(x, y, name=None)，作为独立的函数，inplace 地修改输入;
+- Tensor.copysign(y, name=None) 作为 Tensor 的方法使用，非 inplace;
+- Tensor.copysign_(y, name=None) 作为 Tensor 的方法使用，inplace 修改输入;
 
 其中
 
-+ x(Tensor) - 需要取用绝对值作为输出数值部分的Tensor
-+ y(Tensor, int, float 等 number)
++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64`
++ y(Tensor | Number) - 为 Tensor 时，shape 需要与 x 相同，或者可广播成 x.shape；为 Number 时，支持 `int32`、`int64`、`float32`、`float64`
 
 ## 底层OP设计
 
-参考PyTorch与Numpy中的设计，调用底层cpp实现OP
+参考PyTorch与Numpy中的设计，调用底层cpp实现OP，反向 kernel impl 大致如下：
+
+```cpp
+template<typename T>
+struct CopySignGradFunctor {
+    CopySignGradFunctor(const T* x_data, const T* y_data, const T* dout, T* dx, int64_t numel)
+    : x_data_(x_data), y_data_(y_data), dout_(dout), dx_(dx), numel_(numel) {}
+
+    // backward 逻辑如下
+    HOSTDEVICE void operator()(int64_t idx) const {
+        if (x_data_[idx] == T(0)) dx_[idx] = T(0);
+        else dx_[idx] = T(dout_[idx]) * (T(std::copysign(x_data_[idx], y_data_[idx]) / x_data_[idx]));
+    }
+
+    const T* x_data_;
+    const T* y_data_;
+    const T* dout_;
+    T* dx_;
+    int64_t numel_;
+};
+
+template <typename T, typename Context>
+void CopySignGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   const DenseTensor& out_grad,
+                   DenseTensor* x_grad) {
+    dev_ctx.template Alloc<T>(x_grad);
+    auto x_data = x.data<T>(), y_data = y.data<T>(), out_grad_data = out_grad.data<T>();
+    auto x_grad_data = x_grad->data<T>();
+    phi::funcs::ForRange<Context> for_range(dev_ctx, x.numel());
+    phi::CopySignGradFunctor<T> functor(x_data, y_data, out_grad_data, x_grad_data, x.numel());
+    for_range(functor);
+}
+```
+
+
 
 ## API实现方案
 

From 5fff675576d71b8f17f07f1cad7bbd28a9c0412e Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 14:37:32 +0800
Subject: [PATCH 03/13] fix types

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
index 68a47cb3e..742299e43 100644
--- a/rfcs/APIs/20230914_api_design_for_copysign.md
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -209,8 +209,8 @@ API的设计为:
 
 其中
 
-+ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `int32`、`int64`、`float32`、`float64`
-+ y(Tensor | Number) - 为 Tensor 时，shape 需要与 x 相同，或者可广播成 x.shape；为 Number 时，支持 `int32`、`int64`、`float32`、`float64`
++ x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`
++ y(Tensor | Number) - 为 Tensor 时，shape 需要与 x 相同，或者可广播成 x.shape；为 Number 时，支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`
 
 ## 底层OP设计
 

From bfa46fec6518b3784294cfd2ad6faaeb3c814f1e Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Fri, 15 Sep 2023 14:56:00 +0800
Subject: [PATCH 04/13] fix Number types

---
 rfcs/APIs/20230914_api_design_for_copysign.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rfcs/APIs/20230914_api_design_for_copysign.md b/rfcs/APIs/20230914_api_design_for_copysign.md
index 742299e43..f2a816d4d 100644
--- a/rfcs/APIs/20230914_api_design_for_copysign.md
+++ b/rfcs/APIs/20230914_api_design_for_copysign.md
@@ -210,7 +210,7 @@ API的设计为:
 其中
 
 + x(Tensor) - 需要取用绝对值作为输出数值部分的 Tensor , 支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`
-+ y(Tensor | Number) - 为 Tensor 时，shape 需要与 x 相同，或者可广播成 x.shape；为 Number 时，支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`
++ y(Tensor | Number) - 为 Tensor 时，shape 需要与 x 相同，或者可广播成 x.shape，支持 `bool`、`float16`、`float32`、`float64`、`uint8`、`int8`、`int16`、`int32`、`int64`、`bfloat16`；为 Number 时，支持 `bool`、`int`、`float`
 
 ## 底层OP设计
 

From 72fbcfc9ede2814b78780372175eaf35402187b9 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Wed, 27 Sep 2023 06:40:36 +0800
Subject: [PATCH 05/13] add pdist api design

---
 rfcs/20230926_api_design_for_pdist.md | 409 ++++++++++++++++++++++++++
 1 file changed, 409 insertions(+)
 create mode 100644 rfcs/20230926_api_design_for_pdist.md

diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/20230926_api_design_for_pdist.md
new file mode 100644
index 000000000..4ffe30875
--- /dev/null
+++ b/rfcs/20230926_api_design_for_pdist.md
@@ -0,0 +1,409 @@
+# paddle.pdist设计文档
+
+| API 名称     | paddle.pdist                  |
+| ------------ | ----------------------------- |
+| 提交作者     | coco                          |
+| 提交时间     | 2023-09-26                    |
+| 版本号       | V1.0                          |
+| 依赖飞桨版本 | develop                       |
+| 文件名       | 20230926_api_design_for_pdist |
+
+# 一、概述
+
+## 1、相关背景
+
+为paddle新增该API，用于计算N个向量两两之间的p-norm距离。
+
+## 2、功能目标
+
+一个矩阵`A`的大小为`MxN`，那么`B=pdist(A)`得到的矩阵B的大小为1行`M*(M-1)/2`列，表示的意义是M行数据，每两行计算一下p-norm距离，默认欧式距离。例如a = [[0.0, 1.0],[2.0,3.0],[4.0,5.0],[6.0,7.0]]，输出为[2.8284, 5.6569, 8.4853, 2.8284, 5.6569, 2.8284]。输出顺序为distance(第一行,第二行), distance(第一行,第三行), ... distance(第二行,第三行)...
+
+## 3、意义
+
+飞桨支持直接两两计算向量间的距离。
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## Scipy
+
+Scipy中有API`scipy.spatial.distance.pdist`
+
+在Scipy中介绍为：
+
+```
+Pairwise distances between observations in n-dimensional space.
+```
+
+## 实现方法
+
+从实现方法上，Scipy是通过py实现的，[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L2195-L2233)
+
+```python
+    X = _asarray_validated(X, sparse_ok=False, objects_ok=True, mask_ok=True,
+                           check_finite=False)
+
+    s = X.shape
+    if len(s) != 2:
+        raise ValueError('A 2-dimensional array must be passed.')
+
+    m, n = s
+
+    if callable(metric):
+        mstr = getattr(metric, '__name__', 'UnknownCustomMetric')
+        metric_info = _METRIC_ALIAS.get(mstr, None)
+
+        if metric_info is not None:
+            X, typ, kwargs = _validate_pdist_input(
+                X, m, n, metric_info, **kwargs)
+
+        return _pdist_callable(X, metric=metric, out=out, **kwargs)
+    elif isinstance(metric, str):
+        mstr = metric.lower()
+        metric_info = _METRIC_ALIAS.get(mstr, None)
+
+        if metric_info is not None:
+            pdist_fn = metric_info.pdist_func
+            _extra_windows_error_checks(X, out, (m * (m - 1) / 2,), **kwargs)
+            return pdist_fn(X, out=out, **kwargs)
+        elif mstr.startswith("test_"):
+            metric_info = _TEST_METRICS.get(mstr, None)
+            if metric_info is None:
+                raise ValueError(f'Unknown "Test" Distance Metric: {mstr[5:]}')
+            X, typ, kwargs = _validate_pdist_input(
+                X, m, n, metric_info, **kwargs)
+            return _pdist_callable(
+                X, metric=metric_info.dist_func, out=out, **kwargs)
+        else:
+            raise ValueError('Unknown Distance Metric: %s' % mstr)
+    else:
+        raise TypeError('2nd argument metric must be a string identifier '
+                        'or a function.')
+```
+
+先找到`metric`对应的函数，然后call调用，例如`metric`为`euclidean`时，调用`euclidean`的函数。[代码位置](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L1781C1-L1787C7)
+
+
+
+```python
+    MetricInfo(
+        canonical_name='euclidean',
+        aka={'euclidean', 'euclid', 'eu', 'e'},
+        dist_func=euclidean,
+        cdist_func=_distance_pybind.cdist_euclidean,
+        pdist_func=_distance_pybind.pdist_euclidean,
+    ),
+```
+
+[euclidean调用minkowski](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L500-L536)和[minkowski实现](https://github.com/scipy/scipy/blob/v1.11.2/scipy/spatial/distance.py#L429-L497)
+
+```python
+def euclidean(u, v, w=None):
+    return minkowski(u, v, p=2, w=w)
+
+
+def minkowski(u, v, p=2, w=None):
+    u = _validate_vector(u)
+    v = _validate_vector(v)
+    if p <= 0:
+        raise ValueError("p must be greater than 0")
+    u_v = u - v
+    if w is not None:
+        w = _validate_weights(w)
+        if p == 1:
+            root_w = w
+        elif p == 2:
+            # better precision and speed
+            root_w = np.sqrt(w)
+        elif p == np.inf:
+            root_w = (w != 0)
+        else:
+            root_w = np.power(w, 1/p)
+        u_v = root_w * u_v
+    dist = norm(u_v, ord=p)
+    return dist
+```
+
+主要是调用`norm`实现计算
+
+```python
+def norm(x, ord=None, axis=None):
+    if not issparse(x):
+        raise TypeError("input is not sparse. use numpy.linalg.norm")
+
+    # Check the default case first and handle it immediately.
+    if axis is None and ord in (None, 'fro', 'f'):
+        return _sparse_frobenius_norm(x)
+
+    # Some norms require functions that are not implemented for all types.
+    x = x.tocsr()
+
+    if axis is None:
+        axis = (0, 1)
+    elif not isinstance(axis, tuple):
+        msg = "'axis' must be None, an integer or a tuple of integers"
+        try:
+            int_axis = int(axis)
+        except TypeError as e:
+            raise TypeError(msg) from e
+        if axis != int_axis:
+            raise TypeError(msg)
+        axis = (int_axis,)
+
+    nd = 2
+    if len(axis) == 2:
+        row_axis, col_axis = axis
+        if not (-nd <= row_axis < nd and -nd <= col_axis < nd):
+            raise ValueError('Invalid axis %r for an array with shape %r' %
+                             (axis, x.shape))
+        if row_axis % nd == col_axis % nd:
+            raise ValueError('Duplicate axes given.')
+        if ord == 2:
+            # Only solver="lobpcg" supports all numpy dtypes
+            _, s, _ = svds(x, k=1, solver="lobpcg")
+            return s[0]
+        elif ord == -2:
+            raise NotImplementedError
+            #return _multi_svd_norm(x, row_axis, col_axis, amin)
+        elif ord == 1:
+            return abs(x).sum(axis=row_axis).max(axis=col_axis)[0,0]
+        elif ord == np.inf:
+            return abs(x).sum(axis=col_axis).max(axis=row_axis)[0,0]
+        elif ord == -1:
+            return abs(x).sum(axis=row_axis).min(axis=col_axis)[0,0]
+        elif ord == -np.inf:
+            return abs(x).sum(axis=col_axis).min(axis=row_axis)[0,0]
+        elif ord in (None, 'f', 'fro'):
+            # The axis order does not matter for this norm.
+            return _sparse_frobenius_norm(x)
+        else:
+            raise ValueError("Invalid norm order for matrices.")
+    elif len(axis) == 1:
+        a, = axis
+        if not (-nd <= a < nd):
+            raise ValueError('Invalid axis %r for an array with shape %r' %
+                             (axis, x.shape))
+        if ord == np.inf:
+            M = abs(x).max(axis=a)
+        elif ord == -np.inf:
+            M = abs(x).min(axis=a)
+        elif ord == 0:
+            # Zero norm
+            M = (x != 0).sum(axis=a)
+        elif ord == 1:
+            # special case for speedup
+            M = abs(x).sum(axis=a)
+        elif ord in (2, None):
+            M = sqrt(abs(x).power(2).sum(axis=a))
+        else:
+            try:
+                ord + 1
+            except TypeError as e:
+                raise ValueError('Invalid norm order for vectors.') from e
+            M = np.power(abs(x).power(ord).sum(axis=a), 1 / ord)
+        if hasattr(M, 'toarray'):
+            return M.toarray().ravel()
+        elif hasattr(M, 'A'):
+            return M.A.ravel()
+        else:
+            return M.ravel()
+    else:
+        raise ValueError("Improper number of dimensions to norm.")
+```
+
+
+
+
+
+
+
+
+
+## PyTorch
+
+Parameters:
+
+- **input** – input tensor of shape N×M.
+- **p** – p value for the p-norm distance to calculate between each vector pair, p ∈ [0, ∞].
+
+并且有相关描述：
+
+This function is equivalent to `scipy.spatial.distance.pdist(input, 'minkowski', p=p)` if p∈(0,∞). When p=0 it is equivalent to `scipy.spatial.distance.pdist(input, 'hamming') * M`. When p=∞, the closest scipy function is `scipy.spatial.distance.pdist(xn, lambda x, y: np.abs(x - y).max())`.
+
+
+
+相关[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L58-L64)
+
+```cpp
+Tensor pdist(const Tensor& self, const double p) {
+  TORCH_CHECK(self.dim() == 2,
+      "pdist only supports 2D tensors, got: ", self.dim(), "D");
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes");
+  TORCH_CHECK(p >= 0, "pdist only supports non-negative p values");
+  return at::_pdist_forward(self.contiguous(), p);
+}
+```
+
+调用`_pdist_forward`，[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/Distance.cpp#L244-L262)
+
+```cpp
+Tensor _pdist_forward(const Tensor& self, const double p) {
+  TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input");
+  auto device = self.device().type();
+  TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device);
+  Tensor result = at::empty({0}, self.options(), LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  if (self.size(0) <= 1) {
+    result.resize_({0});
+  } else {
+    int64_t n = self.size(0);
+    int64_t c = n * (n - 1) / 2;
+    result.resize_({c});
+    if (self.size(1) == 0) {
+      result.fill_(0);
+    } else {
+      pdist_forward_stub(device, result, self, p);
+    }
+  }
+  return result;
+}
+```
+
+主要调用`pdist_forward_stub`，绑定了具体的`pdist_forward_kernel_impl`
+
+```cpp
+REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl);
+```
+
+([CPU](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L446)和[CUDA](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cuda/DistanceKernel.cu#L360)实现绑定了同一个`pdist_forward_kernel_impl`)
+
+而后`pdist_forward_kernel_impl`的[实现位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L419C1-L423C2)
+
+```cpp
+void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) {
+  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist", [&] {
+    Dist<scalar_t>::apply_pdist(result, self, p);
+  });
+}
+```
+
+调用`apply_pdist`，[代码位置](https://github.com/pytorch/pytorch/blob/d0f82cd082fad7243226e0ab68fd995873ea7d76/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp#L190-L202)
+
+```cpp
+ // Assumes self is nonempty, contiguous, and 2D
+  static void apply_pdist(Tensor& result, const Tensor& self, const scalar_t p) {
+    if (p == 0.0) {
+      run_parallel_pdist<zdist_calc<Vec>>(result, self, p);
+    } else if (p == 1.0) {
+      run_parallel_pdist<odist_calc<Vec>>(result, self, p);
+    } else if (p == 2.0) {
+      run_parallel_pdist<tdist_calc<Vec>>(result, self, p);
+    } else if (std::isinf(p)) {
+      run_parallel_pdist<idist_calc<Vec>>(result, self, p);
+    } else {
+      run_parallel_pdist<pdist_calc<Vec>>(result, self, p);
+    }
+  }
+```
+
+`run_parallel_pdist`具体实现
+
+```cpp
+  template <typename F>
+  static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) {
+    const scalar_t * const self_start = self.data_ptr<scalar_t>();
+    const scalar_t * const self_end = self_start + self.numel();
+    int64_t n = self.size(0);
+    int64_t m = self.size(1);
+
+    scalar_t * const res_start = result.data_ptr<scalar_t>();
+    int64_t combs = result.numel(); // n * (n - 1) / 2
+
+    // We conceptually iterate over tuples of (i, j, k) where i is the first
+    // vector from the input, j is the second, and k is the result index. This
+    // parallelizes over the range of k and infers what i and j are from the
+    // value of k.
+    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
+      const Vec pvec(p);
+      double n2 = n - .5;
+      // The -1 accounts for floating point truncation issues
+      // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
+      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
+
+      const scalar_t * self_i = self_start + i * m;
+      const scalar_t * self_j = self_start + j * m;
+      scalar_t * res = res_start + k;
+      const scalar_t * const res_end = res_start + end;
+
+      while (res != res_end) {
+        *res = F::finish(vec::map2_reduce_all<scalar_t>(
+          [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); },
+          F::red, self_i, self_j, m), p);
+
+        res += 1;
+        self_j += m;
+        if (self_j == self_end) {
+          self_i += m;
+          self_j = self_i + m;
+        }
+      }
+    });
+  }
+```
+
+
+
+# 四、对比分析
+
+Scipy利用现有API组合实现，PyTorch则在底层重写cpp算子。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为paddle.cdist(x, y, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Tensor，`p` 为p-范数对应的p值，输出为一行 `Mx(M-1)/2` 列的 Tensor
+
+## API实现方案
+
+参考`Paddle.cdist`和与`Scipy`中的设计，组合已有API实现功能
+
+# 六、测试和验收的考量
+
+测试考虑的case如下：
+
+1. 当 `x` 不是 2D 的 Tensor 时，如 PyTorch 一样给出合理的报错提示
+
+   ```python
+   >>> a = []
+   >>> a = torch.tensor(a)
+   >>> b = torch.nn.functional.pdist(a)
+   Traceback (most recent call last):
+     File "<stdin>", line 1, in <module>
+   RuntimeError: pdist only supports 2D tensors, got: 1D
+   >>> b
+   ```
+
+   
+
+2. 结果一致性，和 SciPy 以及 PyTorch 结果的数值的一致性
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考，工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API，对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.nn.functional.pdist.html?highlight=pdist#torch.nn.functional.pdist)
+
+[Scipy文档](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html)
\ No newline at end of file

From 9d2918be55daeb72b367b116ae266d6827e2c83c Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Wed, 27 Sep 2023 16:42:51 +0800
Subject: [PATCH 06/13] fix typo

---
 rfcs/{ => APIs}/20230926_api_design_for_pdist.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename rfcs/{ => APIs}/20230926_api_design_for_pdist.md (98%)

diff --git a/rfcs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
similarity index 98%
rename from rfcs/20230926_api_design_for_pdist.md
rename to rfcs/APIs/20230926_api_design_for_pdist.md
index 4ffe30875..ee64c3148 100644
--- a/rfcs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -364,11 +364,11 @@ Scipy利用现有API组合实现，PyTorch则在底层重写cpp算子。
 
 ## 命名与参数设计
 
-API的设计为paddle.cdist(x, y, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Tensor，`p` 为p-范数对应的p值，输出为一行 `Mx(M-1)/2` 列的 Tensor
+API的设计为paddle.pdist(x, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Tensor，`p` 为p-范数对应的p值，输出为一行 `M*(M-1)/2` 列的 Tensor
 
 ## API实现方案
 
-参考`Paddle.cdist`和与`Scipy`中的设计，组合已有API实现功能
+参考`Paddle.pdist`和与`Scipy`中的设计，组合已有API实现功能
 
 # 六、测试和验收的考量
 

From 3b3e350c49fe09b99f96cee32d4e2e46c5f4e2c7 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 28 Sep 2023 06:39:54 +0800
Subject: [PATCH 07/13] fix

---
 rfcs/APIs/20230926_api_design_for_pdist.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
index ee64c3148..71b46d1d3 100644
--- a/rfcs/APIs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -368,7 +368,7 @@ API的设计为paddle.pdist(x, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Te
 
 ## API实现方案
 
-参考`Paddle.pdist`和与`Scipy`中的设计，组合已有API实现功能
+参考`PyTorch`与`Scipy`中的设计，组合已有API实现功能
 
 # 六、测试和验收的考量
 

From ab440c29329bba7e83f161da48b8a6183110a3de Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 28 Sep 2023 06:58:23 +0800
Subject: [PATCH 08/13] add bitwise_shift rfc

---
 .../20230927_api_design_for_bitwise_shift.md  | 186 ++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md

diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md
new file mode 100644
index 000000000..2f1f6911b
--- /dev/null
+++ b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md
@@ -0,0 +1,186 @@
+# paddle.pdist设计文档
+
+| API 名称     | paddle.bitwise_right_shift<br />paddle.bitwise_left_shift |
+| ------------ | --------------------------------------------------------- |
+| 提交作者     | coco                                                      |
+| 提交时间     | 2023-09-27                                                |
+| 版本号       | V1.0                                                      |
+| 依赖飞桨版本 | develop                                                   |
+| 文件名       | 20230927_api_defign_for_bitwise_shift                     |
+
+# 一、概述
+
+## 1、相关背景
+
+为paddle新增该API，给 Tensor 做 element wise 的算数(或逻辑)左移/右移。
+
+## 2、功能目标
+
+通过一个Tensor给定的bits计算另一个Tensor的的算术（或逻辑）右移/左移。
+
+## 3、意义
+
+飞桨支持直接对Tensor进行元素粒度的左移右移。
+
+# 二、飞桨现状
+
+目前paddle缺少相关功能实现。
+
+# 三、业内方案调研
+
+## PyTorch
+
+PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor`
+
+介绍为：
+
+```
+Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion.
+```
+
+## 实现方法
+
+从实现方法上，PyTorch是将位运算注册到element_wise系列中实现的，[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149)
+
+```python
+shift_right_arithmetic = _make_elementwise_binary_prim(
+    "shift_right_arithmetic",
+    impl_aten=torch.bitwise_right_shift,
+    doc="",
+    type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT,
+)
+```
+
+具体元素尺度的实现，[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405)：
+
+```python
+# TODO(fdrocha): this is currently not being used anywhere,
+# pending on moving triton pin past 972b761
+@staticmethod
+def bitwise_right_shift(x, y):
+    return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}"
+```
+
+
+
+## Numpy
+
+- Parameters:
+
+  - **x1**：array_like, int
+
+    Input values.
+
+  - **x2**：array_like, int
+
+    Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output).
+
+  - **out**：ndarray, None, or tuple of ndarray and None, optional
+
+    A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs.
+
+  - **where**：array_like, optional
+
+    This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized.
+
+  - **kwargs：
+
+    For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs).
+
+Returns:
+
+- **out**：ndarray, int
+
+  Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars.
+
+
+
+相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105)
+
+```cpp
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
+#ifdef @TYPE@_left_shift_needs_clear_floatstatus
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
+    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
+#else
+    BINARY_LOOP {
+        @type@ in1 = *(@type@ *)ip1;
+        @type@ in2 = *(@type@ *)ip2;
+        *(@type@ *)op1 = npy_rshift@c@(in1, in2);
+    }
+#endif
+}
+```
+
+`npy_rshift`相关调用
+
+```cpp
+NPY_INPLACE npy_@u@@type@
+npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return a >> b;
+    }
+#if @is_signed@
+    else if (a < 0) {
+        return (npy_@u@@type@)-1;  /* preserve the sign bit */
+    }
+#endif
+    else {
+        return 0;
+    }
+}
+```
+
+# 四、对比分析
+
+PyTorch是将算子注册到element wise系列中，Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。
+
+# 五、设计思路与实现方案
+
+## 命名与参数设计
+
+API的设计为`paddle.bitwise_right_shift(x, y)`，其余几个shift操作同理，其中 `x` 与 `y` 需要有相同的shape或者能够进行广播，且类型都必须为int。
+
+## API实现方案
+
+参考`PyTorch`和与`Numpy`中的设计，组合已有API实现功能
+
+# 六、测试和验收的考量
+
+测试考虑的case如下：
+
+1. 对 `x`、`y`的 shape 和 dtype 有限制，并给出合理提示
+
+2. 结果一致性，和 PyTorch、Numpy 结果的数值的一致性
+
+# 七、可行性分析及规划排期
+
+有业内方案实现作为参考，工期上可以满足在当前版本周期内开发完成。
+
+# 八、影响面
+
+为独立新增API，对其他模块没有影响
+
+# 名词解释
+
+无
+
+# 附件及参考资料
+
+[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift)
+
+[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift)
\ No newline at end of file

From e3fe25c69c4dd868f98a3474a72afe70c132130a Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Thu, 28 Sep 2023 07:24:06 +0800
Subject: [PATCH 09/13] update

---
 .../20230927_api_design_for_bitwise_shift.md  | 186 ------------------
 1 file changed, 186 deletions(-)
 delete mode 100644 rfcs/APIs/20230927_api_design_for_bitwise_shift.md

diff --git a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md b/rfcs/APIs/20230927_api_design_for_bitwise_shift.md
deleted file mode 100644
index 2f1f6911b..000000000
--- a/rfcs/APIs/20230927_api_design_for_bitwise_shift.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# paddle.pdist设计文档
-
-| API 名称     | paddle.bitwise_right_shift<br />paddle.bitwise_left_shift |
-| ------------ | --------------------------------------------------------- |
-| 提交作者     | coco                                                      |
-| 提交时间     | 2023-09-27                                                |
-| 版本号       | V1.0                                                      |
-| 依赖飞桨版本 | develop                                                   |
-| 文件名       | 20230927_api_defign_for_bitwise_shift                     |
-
-# 一、概述
-
-## 1、相关背景
-
-为paddle新增该API，给 Tensor 做 element wise 的算数(或逻辑)左移/右移。
-
-## 2、功能目标
-
-通过一个Tensor给定的bits计算另一个Tensor的的算术（或逻辑）右移/左移。
-
-## 3、意义
-
-飞桨支持直接对Tensor进行元素粒度的左移右移。
-
-# 二、飞桨现状
-
-目前paddle缺少相关功能实现。
-
-# 三、业内方案调研
-
-## PyTorch
-
-PyTorch中有API`torch.bitwise_right_shift(input, other, *, out=None) → Tensor`
-
-介绍为：
-
-```
-Computes the right arithmetic shift of input by other bits. The input tensor must be of integral type. This operator supports broadcasting to a common shape and type promotion.
-```
-
-## 实现方法
-
-从实现方法上，PyTorch是将位运算注册到element_wise系列中实现的，[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_prims/__init__.py#L1144-L1149)
-
-```python
-shift_right_arithmetic = _make_elementwise_binary_prim(
-    "shift_right_arithmetic",
-    impl_aten=torch.bitwise_right_shift,
-    doc="",
-    type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT,
-)
-```
-
-具体元素尺度的实现，[代码位置](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/codegen/common.py#L401-L405)：
-
-```python
-# TODO(fdrocha): this is currently not being used anywhere,
-# pending on moving triton pin past 972b761
-@staticmethod
-def bitwise_right_shift(x, y):
-    return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}"
-```
-
-
-
-## Numpy
-
-- Parameters:
-
-  - **x1**：array_like, int
-
-    Input values.
-
-  - **x2**：array_like, int
-
-    Number of bits to remove at the right of *x1*. If `x1.shape != x2.shape`, they must be broadcastable to a common shape (which becomes the shape of the output).
-
-  - **out**：ndarray, None, or tuple of ndarray and None, optional
-
-    A location into which the result is stored. If provided, it must have a shape that the inputs broadcast to. If not provided or None, a freshly-allocated array is returned. A tuple (possible only as a keyword argument) must have length equal to the number of outputs.
-
-  - **where**：array_like, optional
-
-    This condition is broadcast over the input. At locations where the condition is True, the *out* array will be set to the ufunc result. Elsewhere, the *out* array will retain its original value. Note that if an uninitialized *out* array is created via the default `out=None`, locations within it where the condition is False will remain uninitialized.
-
-  - **kwargs：
-
-    For other keyword-only arguments, see the [ufunc docs](https://numpy.org/doc/stable/reference/ufuncs.html#ufuncs-kwargs).
-
-Returns:
-
-- **out**：ndarray, int
-
-  Return *x1* with bits shifted *x2* times to the right. This is a scalar if both *x1* and *x2* are scalars.
-
-
-
-相关[实现位置](https://github.com/numpy/numpy/blob/9d4c1484b96ed2b7dff49c479e9d0822a4b91f80/numpy/core/src/umath/loops_autovec.dispatch.c.src#L81-L105)
-
-```cpp
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift)
-(char **args, npy_intp const *dimensions, npy_intp const *steps,
-                  void *NPY_UNUSED(func))
-{
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
-#ifdef @TYPE@_left_shift_needs_clear_floatstatus
-    // For some reason, our macOS CI sets an "invalid" flag here, but only
-    // for some types.
-    npy_clear_floatstatus_barrier((char*)dimensions);
-#endif
-}
-
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
-    BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
-#else
-    BINARY_LOOP {
-        @type@ in1 = *(@type@ *)ip1;
-        @type@ in2 = *(@type@ *)ip2;
-        *(@type@ *)op1 = npy_rshift@c@(in1, in2);
-    }
-#endif
-}
-```
-
-`npy_rshift`相关调用
-
-```cpp
-NPY_INPLACE npy_@u@@type@
-npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
-{
-    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
-        return a >> b;
-    }
-#if @is_signed@
-    else if (a < 0) {
-        return (npy_@u@@type@)-1;  /* preserve the sign bit */
-    }
-#endif
-    else {
-        return 0;
-    }
-}
-```
-
-# 四、对比分析
-
-PyTorch是将算子注册到element wise系列中，Numpy也类似地`BINARY_LOOP`来做element wise的shift操作。
-
-# 五、设计思路与实现方案
-
-## 命名与参数设计
-
-API的设计为`paddle.bitwise_right_shift(x, y)`，其余几个shift操作同理，其中 `x` 与 `y` 需要有相同的shape或者能够进行广播，且类型都必须为int。
-
-## API实现方案
-
-参考`PyTorch`和与`Numpy`中的设计，组合已有API实现功能
-
-# 六、测试和验收的考量
-
-测试考虑的case如下：
-
-1. 对 `x`、`y`的 shape 和 dtype 有限制，并给出合理提示
-
-2. 结果一致性，和 PyTorch、Numpy 结果的数值的一致性
-
-# 七、可行性分析及规划排期
-
-有业内方案实现作为参考，工期上可以满足在当前版本周期内开发完成。
-
-# 八、影响面
-
-为独立新增API，对其他模块没有影响
-
-# 名词解释
-
-无
-
-# 附件及参考资料
-
-[PyTorch文档](https://pytorch.org/docs/stable/generated/torch.bitwise_right_shift.html?highlight=bitwise_right_shift#torch.bitwise_right_shift)
-
-[Numpy文档](https://numpy.org/doc/stable/reference/generated/numpy.right_shift.html#numpy.right_shift)
\ No newline at end of file

From 933ee4e49aec49437a487d8fd9ec8c1f773c9aeb Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 15:07:41 +0800
Subject: [PATCH 10/13] fix

---
 rfcs/APIs/20230926_api_design_for_pdist.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
index 71b46d1d3..92a9b2662 100644
--- a/rfcs/APIs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -20,7 +20,7 @@
 
 ## 3、意义
 
-飞桨支持直接两两计算向量间的距离。
+飞桨支持计算大小为 (M×N) 的矩阵中，M 个 N 维行向量两两之间的 p-norm 距离。
 
 # 二、飞桨现状
 
@@ -368,7 +368,11 @@ API的设计为paddle.pdist(x, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Te
 
 ## API实现方案
 
-参考`PyTorch`与`Scipy`中的设计，组合已有API实现功能
+参考`PyTorch`与`Scipy`中的设计，组合已有API实现功能：
+
+在 Paddle repo 的 `python/paddle/nn/functional/distance.py` 文件中实现 pdist；并在 `python/paddle/nn/functional/__init__.py` 中添加 pdist API，以支持 paddle.Tensor.pdist 的调用方式；
+
+使用的API：`paddle.cdist`,`paddle.tril`,`paddle.masked_select`
 
 # 六、测试和验收的考量
 

From 1d88c0907af5709ed7e890def73b3e0abcdca19d Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 18:22:33 +0800
Subject: [PATCH 11/13] add test path

---
 rfcs/APIs/20230926_api_design_for_pdist.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
index 92a9b2662..7480e9e6f 100644
--- a/rfcs/APIs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -376,6 +376,8 @@ API的设计为paddle.pdist(x, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Te
 
 # 六、测试和验收的考量
 
+单测代码位置：Paddle repo 的 `paddle/test/legacy_test/test_pdist.py` 文件。
+
 测试考虑的case如下：
 
 1. 当`x`、`y` 2D 的 Tensor，并如PyTorch给出合理提示

From 4ff2616c5d127a55e9f715111f08b98217f9bf67 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 19:02:19 +0800
Subject: [PATCH 12/13] add args details

---
 rfcs/APIs/20230926_api_design_for_pdist.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
index 7480e9e6f..0b7172a72 100644
--- a/rfcs/APIs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -364,7 +364,25 @@ Scipy利用现有API组合实现，PyTorch则在底层重写cpp算子。
 
 ## 命名与参数设计
 
-API的设计为paddle.pdist(x, p=2.0)，其中 `x` 严格为 shape=[M, N] 的 Tensor，`p` 为p-范数对应的p值，输出为一行 `Mx(M-1)/2` 列的 Tensor
+API的设计为:
+
+`paddle.pdist(x, p=2.0, compute_mode="use_mm_for_euclid_dist_if_necessary", name=None)`
+
+Args：
+
++ x(Tensor): 严格为 shape=[M, N] 的 Tensor
++ p(float, optional): 为p-范数对应的p值，默认为2.0
++ compute_mode(str, optional): 默认为`use_mm_for_euclid_dist_if_necessary`（组合已有API过程中用到了`paddle.cdist`，当`p=2.0`时，可以设置`compute_mode`利用矩阵运算进行优化）
+  + `compute_mode=use_mm_for_euclid_dist_if_necessary`时，当p=2.0且M>25时使用矩阵乘法计算距离
+  + `compute_mode=use_mm_for_euclid_dist`时，当p=2.0时使用矩阵乘法计算距离
+  + `compute_mode=donot_use_mm_for_euclid_dist`时，不使用矩阵乘法计算距离
++ name(str, 可选): 操作的名称(默认为None)
+
+Return：
+
++ 一行 `Mx(M-1)/2` 列的 Tensor
+
+
 
 ## API实现方案
 

From e574498f968ae58a3a55bcfcc86b313f73b9fa30 Mon Sep 17 00:00:00 2001
From: coco <1228759711@qq.com>
Date: Sat, 7 Oct 2023 19:04:53 +0800
Subject: [PATCH 13/13] typo

---
 rfcs/APIs/20230926_api_design_for_pdist.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rfcs/APIs/20230926_api_design_for_pdist.md b/rfcs/APIs/20230926_api_design_for_pdist.md
index 0b7172a72..89d9b92b9 100644
--- a/rfcs/APIs/20230926_api_design_for_pdist.md
+++ b/rfcs/APIs/20230926_api_design_for_pdist.md
@@ -371,8 +371,8 @@ API的设计为:
 Args：
 
 + x(Tensor): 严格为 shape=[M, N] 的 Tensor
-+ p(float, optional): 为p-范数对应的p值，默认为2.0
-+ compute_mode(str, optional): 默认为`use_mm_for_euclid_dist_if_necessary`（组合已有API过程中用到了`paddle.cdist`，当`p=2.0`时，可以设置`compute_mode`利用矩阵运算进行优化）
++ p(float, 可选): 为p-范数对应的p值，默认为2.0
++ compute_mode(str, 可选): 默认为`use_mm_for_euclid_dist_if_necessary`（组合已有API过程中用到了`paddle.cdist`，当`p=2.0`时，可以设置`compute_mode`利用矩阵运算进行优化）
   + `compute_mode=use_mm_for_euclid_dist_if_necessary`时，当p=2.0且M>25时使用矩阵乘法计算距离
   + `compute_mode=use_mm_for_euclid_dist`时，当p=2.0时使用矩阵乘法计算距离
   + `compute_mode=donot_use_mm_for_euclid_dist`时，不使用矩阵乘法计算距离