From 3d710589d40968b74241eec45043295b02e63fd9 Mon Sep 17 00:00:00 2001
From: Siming Dai <908660116@qq.com>
Date: Wed, 20 Apr 2022 20:34:05 +0800
Subject: [PATCH] Add graph api docs (#4661) (#4664)

* Add graph apis docs

* fix khop_sampler doc

* fix khop_sample doc

* fix khop sampler doc

* revise doc
---
 .../paddle/incubate/graph_khop_sampler_cn.rst | 46 +++++++++++++
 docs/api/paddle/incubate/graph_reindex_cn.rst | 69 +++++++++++++++++++
 .../incubate/graph_sample_neighbors_cn.rst    | 47 +++++++++++++
 .../paddle/incubate/graph_send_recv_cn.rst    | 22 +++++-
 4 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 docs/api/paddle/incubate/graph_khop_sampler_cn.rst
 create mode 100644 docs/api/paddle/incubate/graph_reindex_cn.rst
 create mode 100644 docs/api/paddle/incubate/graph_sample_neighbors_cn.rst

diff --git a/docs/api/paddle/incubate/graph_khop_sampler_cn.rst b/docs/api/paddle/incubate/graph_khop_sampler_cn.rst
new file mode 100644
index 00000000000..68151d5f7ba
--- /dev/null
+++ b/docs/api/paddle/incubate/graph_khop_sampler_cn.rst
@@ -0,0 +1,46 @@
+.. _cn_api_incubate_graph_khop_sampler:
+
+graph_khop_sampler
+-------------------------------
+
+.. py:function:: paddle.incubate.graph_khop_sampler(row, colptr, input_nodes, sample_sizes, sorted_eids=None, return_eids=False, name=None)
+
+主要应用于图学习领域，将节点邻居采样和节点重编号两步骤统一在一起，同时提供多层邻居采样的功能。关于邻居采样和节点重编号的相关API可以分别参考 :ref:`cn_api_incubate_graph_sample_neighbors` 和 :ref:`cn_api_incubate_graph_reindex` 。
+
+参数
+:::::::::
+    - row (Tensor) - 输入原始图的CSC格式的行信息，数据类型为：int32、int64，形状为[num_edges, 1] 或 [num_edges]。
+    - colptr (Tensor) - 输入原始图的CSC格式的压缩列信息，数据类型应当与 ``row`` 一致，形状为[num_nodes + 1, 1]或 [num_nodes + 1]。
+    - input_nodes (Tensor) - 需进行邻居采样的中心节点信息，数据类型应当与 ``row`` 一致。
+    - sample_sizes (list | tuple) - 表示每一层需要采样的邻居个数，数据类型为int。
+    - sorted_eids (Tensor，可选) - 输入原始图在CSC格式下的边编号信息。如果 ``return_eids`` 为True，则不能为空。数据类型应当与 ``row`` 一致。默认值为None，表示不需要返回边编号信息。
+    - return_eids (bool) - 是否返回采样后对应的原始边编号信息，默认为False。
+    - name (str，可选) - 操作的名称（可选，默认值为None）。更多信息请参见 :ref:`api_guide_Name` 。
+
+返回
+:::::::::
+    - edge_src (Tensor) - 返回采样后重索引的边对应的源节点信息。
+    - edge_dst (Tensor) - 返回采样后重索引的边对应的目标节点信息。
+    - sample_index (Tensor) - 返回去重后的输入中心节点信息和邻居信息，且为原始编号。
+    - reindex_nodes (Tensor) - 返回输入中心节点在 ``sample_index`` 中的下标索引位置。
+    - edge_eids (Tensor) - 如果 ``return_eids`` 为True，则会返回采样边对应的编号信息，否则不返回。
+
+
+代码示例
+::::::::::
+
+.. code-block:: python
+
+    import paddle
+
+    row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
+    colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
+    nodes = [0, 8, 1, 2]
+    sample_sizes = [2, 2]
+    row = paddle.to_tensor(row, dtype="int64")
+    colptr = paddle.to_tensor(colptr, dtype="int64")
+    nodes = paddle.to_tensor(nodes, dtype="int64")
+
+    edge_src, edge_dst, sample_index, reindex_nodes = \
+        paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes)
+
diff --git a/docs/api/paddle/incubate/graph_reindex_cn.rst b/docs/api/paddle/incubate/graph_reindex_cn.rst
new file mode 100644
index 00000000000..91b2f2d3f88
--- /dev/null
+++ b/docs/api/paddle/incubate/graph_reindex_cn.rst
@@ -0,0 +1,69 @@
+.. _cn_api_incubate_graph_reindex:
+
+graph_reindex
+-------------------------------
+
+.. py:function:: paddle.incubate.graph_reindex(x, neighbors, count, value_buffer=None, index_buffer=None, flag_buffer_hashtable=False, name=None)
+
+主要应用于图学习领域，需要与图采样相关的API配合使用。其主要目的是对输入的中心节点信息和邻居信息进行从0开始的重新编号，以方便后续的图模型子图训练。
+
+.. code-block:: text
+
+        Given:
+
+        x = [0, 1, 2]
+
+        neighbors = [8, 9, 0, 4, 7, 6, 7]
+
+        count = [2, 3, 2]
+
+        value_buffer = None
+
+        index_buffer = None
+
+        flag_buffer_hashtable = False
+
+        Then:
+
+        reindex_src = [3, 4, 0, 5, 6, 7, 6]
+
+        reindex_dst = [0, 0, 1, 1, 1, 2, 2]
+
+        out_nodes = [0, 1, 2, 8, 9, 4, 7, 6]  # 可以将对应位置的节点编号替换到重编号的边中，得到重编号前的边信息。
+
+参数
+:::::::::
+    - x (Tensor) - 输入的中心节点原始编号，数据类型为：int32、int64。
+    - neighbors (Tensor) - 中心节点的邻居节点编号，数据类型为：int32、int64。
+    - count (Tensor) - 中心节点各自的邻居数目，数据类型为：int32。
+    - value_buffer (Tensor，可选) - 用于快速哈希索引的缓存Tensor，可加速重编号过程。数据类型为int32，并且应当事先填充为-1。如果 ``flag_buffer_hashtable`` 为True，则不可为空。默认值为None。
+    - index_buffer (Tensor，可选) - 用于快速哈希索引的缓存Tensor，可加速重编号过程。数据类型为int32，并且应当事先填充为-1。如果 ``flag_buffer_hashtable`` 为True，则不可为空。默认值为None。
+    - flag_buffer_hashtable (bool) - 是否采取快速哈希索引，默认为False。只适用于GPU版本的API。
+    - name (str，可选) - 操作的名称（可选，默认值为None）。更多信息请参见 :ref:`api_guide_Name` 。
+
+返回
+:::::::::
+    - reindex_src (Tensor) - 重编号后的边对应的源节点信息。
+    - reindex_dst (Tensor) - 重编号后的边对应的目标节点信息。
+    - out_nodes (Tensor) - 返回去重后的输入中心节点信息和邻居信息，且为原始编号。注意，我们将输入的中心节点编号信息放置于前面，而邻居信息放置于后面。
+
+
+代码示例
+::::::::::
+
+.. code-block:: python
+
+    import paddle
+
+    x = [0, 1, 2]
+    neighbors = [8, 9, 0, 4, 7, 6, 7]
+    count = [2, 3, 2]
+    x = paddle.to_tensor(x, dtype="int64")
+    neighbors = paddle.to_tensor(neighbors, dtype="int64")
+    count = paddle.to_tensor(count, dtype="int32")
+    
+    reindex_src, reindex_dst, out_nodes = \
+        paddle.incubate.graph_reindex(x, neighbors, count)
+    # reindex_src: [3, 4, 0, 5, 6, 7, 6]
+    # reindex_dst: [0, 0, 1, 1, 1, 2, 2]
+    # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]
diff --git a/docs/api/paddle/incubate/graph_sample_neighbors_cn.rst b/docs/api/paddle/incubate/graph_sample_neighbors_cn.rst
new file mode 100644
index 00000000000..fa65474c5d9
--- /dev/null
+++ b/docs/api/paddle/incubate/graph_sample_neighbors_cn.rst
@@ -0,0 +1,47 @@
+.. _cn_api_incubate_graph_sample_neighbors:
+
+graph_sample_neighbors
+-------------------------------
+
+.. py:function:: paddle.incubate.graph_sample_neighbors(row, colptr, input_nodes, eids=None, perm_buffer=None, sample_size=-1, return_eids=False, flag_perm_buffer=False, name=None)
+
+主要应用于图学习领域，主要目的是提供高性能图邻居采样方法。通过输入图的CSC（Compressed Sparse Column，压缩稀疏列）格式信息，即 ``row`` 和 ``colptr`` ，将图转换为适用于邻居采样的格式，再输入需要进行采样的中心节点 ``input_nodes`` ，以及采样的邻居个数 ``sample_size`` ，从而可以获得对应中心节点采样后的邻居。另外，在GPU版本提供了Fisher-Yates高性能图采样方法。
+
+参数
+:::::::::
+    - row (Tensor) - 输入原始图的CSC格式的行信息，数据类型为：int32、int64，形状为[num_edges, 1] 或 [num_edges]。
+    - colptr (Tensor) - 输入原始图的CSC格式的压缩列信息，数据类型应当与 ``row`` 一致，形状为[num_nodes + 1, 1]或 [num_nodes + 1]。
+    - input_nodes (Tensor) - 需进行邻居采样的中心节点信息，数据类型应当与 ``row`` 一致。
+    - eids (Tensor，可选) - 输入原始图在CSC格式下的边编号信息。如果 ``return_eids`` 为True，则不能为空。数据类型应当与 ``row`` 一致。默认为None，表示不需要返回边编号信息。
+    - perm_buffer (Tensor，可选) - Fisher-Yates采样方法需要用到的缓存Tensor。如果 ``flag_perm_buffer`` 为True，则不能为空。数据类型应当与 ``row`` 一致，形状为[num_edges]，填充内容为0 至 num_edges - 1的顺序递增序列。默认值为None。
+    - sample_size (int) - 采样邻居个数。默认值为-1，表示采样输入中心节点的所有邻居。
+    - return_eids (bool) - 是否返回采样后对应的原始边编号信息，默认为False。
+    - flag_perm_buffer (bool) - 是否采用Fisher-Yates采样方法，默认为False。
+    - name (str，可选) - 操作的名称（可选，默认值为None）。更多信息请参见 :ref:`api_guide_Name` 。
+
+返回
+:::::::::
+    - out_neighbors (Tensor) - 返回采样后的邻居节点。
+    - out_count (Tensor) - 返回中心节点各自对应的采样邻居数目，形状应该与 ``input_nodes`` 一致。
+    - out_eids (Tensor) - 如果 ``return_eids`` 为True，则会返回采样边对应的编号信息，否则不返回。
+
+
+代码示例
+::::::::::
+
+.. code-block:: python
+
+    import paddle
+ 
+    # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
+    #        (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
+    row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
+    colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
+    nodes = [0, 8, 1, 2]
+    sample_size = 2
+    row = paddle.to_tensor(row, dtype="int64")
+    colptr = paddle.to_tensor(colptr, dtype="int64")
+    nodes = paddle.to_tensor(nodes, dtype="int64")
+    out_neighbors, out_count = \
+        paddle.incubate.graph_sample_neighbors(row, colptr, nodes, 
+                                               sample_size=sample_size)
diff --git a/docs/api/paddle/incubate/graph_send_recv_cn.rst b/docs/api/paddle/incubate/graph_send_recv_cn.rst
index 22940d94ddb..457ffed4940 100644
--- a/docs/api/paddle/incubate/graph_send_recv_cn.rst
+++ b/docs/api/paddle/incubate/graph_send_recv_cn.rst
@@ -3,9 +3,9 @@
 graph_send_recv
 -------------------------------
 
-.. py:function:: paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None)
+.. py:function:: paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=None, name=None)
 
-此API主要应用于图学习领域，目的是为了减少在消息传递过程中带来的中间变量显存或内存的损耗。其中， ``x`` 作为输入Tensor，首先利用 ``src_index`` 作为索引来gather出在 ``x`` 中相应位置的数据，随后再将gather出的结果利用 ``dst_index`` 来更新到对应的输出结果中，其中 ``pool_type`` 表示不同的更新方式，包括sum、mean、max、min共计4种处理模式。
+主要应用于图学习领域，目的是为了减少在消息传递过程中带来的中间变量显存或内存的损耗。其中， ``x`` 作为输入Tensor，首先利用 ``src_index`` 作为索引来gather出在 ``x`` 中相应位置的数据，随后再将gather出的结果利用 ``dst_index`` 来更新到对应的输出结果中，其中 ``pool_type`` 表示不同的更新方式，包括sum、mean、max、min共计4种处理模式。另外，提供了 ``out_size`` 参数，用于设置实际输出的形状，有利于减少实际显存占用。
 
 .. code-block:: text
 
@@ -19,6 +19,8 @@ graph_send_recv
 
         pool_type = "sum"
 
+        out_size = None
+
         Then:
 
         Out = [[0, 2, 3],
@@ -31,6 +33,7 @@ graph_send_recv
     - src_index (Tensor) - 1-D Tensor，数据类型为：int32、int64。
     - dst_index (Tensor) - 1-D Tensor，数据类型为：int32、int64。注意： ``dst_index`` 的形状应当与 ``src_index`` 一致。
     - pool_type (str) - scatter结果的不同处理方式，包括sum、mean、max、min。 默认值为 sum。
+    - out_size (int64，可选) - 可以根据实际需求设置 ``out_size`` 来改变实际输出形状。默认值为None，表示这个参数将不会被使用。注意， ``out_size`` 的值必须等于或大于 ``max(dst_index) + 1`` 。
     - name (str，可选) - 操作的名称(可选，默认值为None）。更多信息请参见 :ref:`api_guide_Name` 。
 
 返回
@@ -51,3 +54,18 @@ graph_send_recv
     dst_index = indexes[:, 1]
     out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
     # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]
+
+    x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+    indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+    src_index = indexes[:, 0]
+    dst_index = indexes[:, 1]
+    out_size = paddle.max(dst_index) + 1
+    out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size)
+    # Outputs: [[0., 2., 3.], [2., 8., 10.]]
+
+    x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+    indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+    src_index = indexes[:, 0]
+    dst_index = indexes[:, 1]
+    out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
+    # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]]