Optimize sparse convolution #43576

Status: Merged (97 commits; merged Jul 26, 2022)
The diff below shows the changes from 49 of the 97 commits.

Commits:
bb4db9b
test sparse model
May 9, 2022
441da36
refactor code structure
EsdeathYZH May 29, 2022
c48e076
add native kernel usage
EsdeathYZH May 29, 2022
0a68ba3
add wellford impl
EsdeathYZH Jun 3, 2022
b3248c9
add shmem impl
EsdeathYZH Jun 4, 2022
78349a2
add dispatch logic
EsdeathYZH Jun 4, 2022
98c66f0
add channel_last impl
EsdeathYZH Jun 4, 2022
570dc55
refine the global space init
EsdeathYZH Jun 6, 2022
aaca04a
impl 2d kernel
EsdeathYZH Jun 7, 2022
29ef723
Merge remote-tracking branch 'paddle/develop' into optim_batchnorm1d
EsdeathYZH Jun 11, 2022
74b792b
rm wellford
EsdeathYZH Jun 11, 2022
a0bd5b6
fix backward
EsdeathYZH Jun 11, 2022
2433ebf
add unit test for batchnorm1d
EsdeathYZH Jun 11, 2022
90c27a6
fix bug
EsdeathYZH Jun 11, 2022
91d83e5
impl channel last 2d
EsdeathYZH Jun 11, 2022
6871dbf
refine
EsdeathYZH Jun 15, 2022
0571ecc
fix memory thpt
EsdeathYZH Jun 15, 2022
97493af
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jun 16, 2022
3fc54ad
opt gather
Jun 16, 2022
804ba03
fix threshold
EsdeathYZH Jun 16, 2022
48c6344
fix backward threshold
EsdeathYZH Jun 16, 2022
6785f6f
refine unit test
EsdeathYZH Jun 16, 2022
e46ef54
refine test
EsdeathYZH Jun 16, 2022
938cde3
delete pragma unroll
EsdeathYZH Jun 17, 2022
a24f2aa
opt gather and scatter
Jun 20, 2022
64be38b
opt conv
Jun 20, 2022
c0fce45
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jun 21, 2022
26ca7ab
fix batch csr
Jun 21, 2022
11011c0
remove the unused file
Jun 21, 2022
de91d40
opt SparseMaskCopyKernel
Jun 21, 2022
a137947
merge origin
Jun 21, 2022
3625aae
Merge branch 'opt_conv' of https://github.com/zkh2016/Paddle into opt…
Jun 21, 2022
6a92b32
opt subm
Jun 22, 2022
e96f090
opt subm
Jun 22, 2022
7e272fb
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jun 22, 2022
dd8e9ca
merge upstream
Jun 22, 2022
c7eddc5
opt copy rulebook
Jun 22, 2022
dd5e4fd
check cache size
Jun 23, 2022
52367a3
check cache size
Jun 23, 2022
345ebb2
correct alloc out values
Jun 23, 2022
d7bb341
merge origin
Jun 23, 2022
a79206f
Merge remote-tracking branch 'paddle/develop' into optim_batchnorm1d
EsdeathYZH Jun 24, 2022
459fd81
save the rulebook of submanifold conv
Jun 24, 2022
596bfbd
fix backward
Jun 24, 2022
c906bdb
opt conv
Jun 25, 2022
823b5c6
Merge branch 'opt_conv' of https://github.com/zkh2016/Paddle into opt…
Jun 25, 2022
6dc1584
opt conv3d
Jun 26, 2022
8202771
opt scatter
Jun 27, 2022
75df1e2
opt SparseMaskCopy
Jun 27, 2022
2745b0e
coalesced is not performed by default
Jun 27, 2022
ad9c2b6
opt rulebook
Jun 27, 2022
214475b
remove a sync
Jun 27, 2022
13f0b93
gatherV2
Jun 28, 2022
c9929a2
opt gather of backward
Jun 28, 2022
32d1e03
merge upstream
Jun 28, 2022
dab4609
resolve conflict
Jun 28, 2022
f66d0c7
opt groups indexs
Jun 28, 2022
44ad03e
refine code
EsdeathYZH Jun 28, 2022
db9792c
replace sort with remove_copy
Jul 1, 2022
6f9d6ea
fix cache
Jul 1, 2022
c295776
unorder the out index of Conv3D
Jul 3, 2022
bf6fabf
unorder the out index of Conv3D
Jul 3, 2022
b92786d
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 4, 2022
e181b1c
add coalesced
Jul 4, 2022
e2bf43a
add coalesced.py
Jul 4, 2022
0aa457f
coalesced before compare result
Jul 5, 2022
749dcce
the key of conv3d is not required
Jul 5, 2022
d38563b
opt code structure
Jul 5, 2022
d06527f
opt gather/scatter code structure
Jul 6, 2022
842acf7
fix pool
Jul 6, 2022
5751987
rename pool_kernel.cc
Jul 6, 2022
7c2fbf5
add new file
Jul 6, 2022
6684d94
for ci
Jul 6, 2022
4346bbb
fix comment
Jul 6, 2022
3187f52
opt code structure
Jul 6, 2022
aa284f4
rename conv_kernel
Jul 6, 2022
57bb27e
Merge remote-tracking branch 'zihang/optim_batchnorm1d' into opt_conv
Jul 7, 2022
0b5ca0e
fix
EsdeathYZH Jul 8, 2022
33ebaf5
rename table_ptr to indices_dict
Jul 8, 2022
d49a06b
Merge remote-tracking branch 'zihang/optim_batchnorm1d' into opt_conv
Jul 8, 2022
123b16c
fix test_sparse_utils
Jul 8, 2022
8658c28
merge upstream
Jul 13, 2022
c154249
Merge branch 'opt_conv' of https://github.com/zkh2016/Paddle into opt…
Jul 13, 2022
af66998
sparse support amp
Jul 13, 2022
8dcc194
Merge branch 'develop' into opt_conv
Jul 13, 2022
aba1e6e
Merge branch 'develop' into opt_conv
Jul 13, 2022
55b30a3
merge upstream
Jul 15, 2022
1864dfc
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 18, 2022
3c9ec13
Merge remote-tracking branch 'zkh/opt_conv' into upstream_dev
Jul 18, 2022
91ee01b
resolve conflict
Jul 18, 2022
927e247
resolve conflict
Jul 18, 2022
4480cdf
Merge branch 'opt_conv' of https://github.com/zkh2016/Paddle into opt…
Jul 18, 2022
dbe743b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 19, 2022
a51402a
fix codestyle
Jul 19, 2022
0f9827e
merge upstream
Jul 19, 2022
8c442f6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 25, 2022
ab996e0
supplement the description of key
Jul 25, 2022
Files changed:
21 changes: 14 additions & 7 deletions paddle/phi/api/yaml/sparse_api.yaml

@@ -8,12 +8,12 @@
   backward : add_grad
 
 - api : conv3d
-  args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
-  output : Tensor(out), Tensor(rulebook)
+  args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
+  output : Tensor(out), Tensor(rulebook), Tensor(counter)
   kernel :
-    func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense}
+    func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense}
   layout : x
-  intermediate : rulebook
+  intermediate: rulebook, counter
   backward : conv3d_grad
 
 - api : coo_to_dense

@@ -132,6 +132,13 @@
   layout : x
   backward : values_grad
 
+- api: coalesced
+  args : (Tensor x)
+  output : Tensor(out)
+  kernel :
+    func: coalesced{sparse_coo -> sparse_coo}
+  layout : x
+
 - api: full_like
   args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED)
   output : Tensor(out)

@@ -162,11 +169,11 @@
 
 - api: maxpool
   args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides)
-  output : Tensor(out), Tensor(rulebook)
+  output : Tensor(out), Tensor(rulebook), Tensor(counter)
   kernel :
-    func : sparse_maxpool{sparse_coo -> sparse_coo, dense}
+    func : sparse_maxpool{sparse_coo -> sparse_coo, dense, dense}
   layout : x
-  intermediate : rulebook
+  intermediate : rulebook, counter
   backward : sparse_maxpool_grad
 
 - api: mv
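A note on the `func` annotation for readers unfamiliar with these YAML files: the types before the `->` describe the kernel's inputs and the types after it describe its outputs, in order. For the updated conv3d entry, the layout string lines up with the Conv3dKernel declaration that appears later in this PR (paddle/phi/kernels/sparse/convolution_kernel.h); the copy below only adds comments mapping each parameter to its slot.

// sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense}
// Declaration as in convolution_kernel.h in this PR; comments added here.
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
                  const SparseCooTensor& x,       // input: sparse_coo
                  const DenseTensor& kernel,      // input: dense
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations,
                  const std::vector<int>& strides,
                  const int groups,
                  const bool subm,
                  const std::string& key,         // new attribute (str key)
                  SparseCooTensor* out,           // output: sparse_coo
                  DenseTensor* rulebook,          // output: dense, intermediate
                  DenseTensor* counter);          // output: dense, intermediate

The backward entries below follow the same convention, with the grad kernel's tensor arguments listed position by position in the layout string.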
12 changes: 6 additions & 6 deletions paddle/phi/api/yaml/sparse_bw_api.yaml

@@ -7,11 +7,11 @@
     add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr}
 
 - backward_api : conv3d_grad
-  forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor)
-  args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm)
+  forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor)
+  args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
   output : Tensor(x_grad), Tensor(kernel_grad)
   kernel :
-    func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
+    func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense}
 
 - backward_api : coo_to_dense_grad
   forward : coo_to_dense(Tensor x) -> Tensor(out)

@@ -93,11 +93,11 @@
     func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr}
 
 - backward_api : sparse_maxpool_grad
-  forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook)
-  args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes)
+  forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter)
+  args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes)
   output : Tensor(x_grad)
   kernel :
-    func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo}
+    func : sparse_maxpool_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo}
 
 - backward_api : sqrt_grad
   forward : sqrt(Tensor x) -> Tensor(out)
38 changes: 38 additions & 0 deletions paddle/phi/core/sparse_coo_tensor.h

@@ -156,6 +156,38 @@ class SparseCooTensor : public TensorBase,
   /// \brief get the dense dim
   int32_t dense_dim() const;
 
+  const std::pair<DenseTensor, std::vector<int>>* table(
+      const std::string& key) const {
+    const auto& iter = table_ptr_->find(key);
+    if (iter == table_ptr_->end()) {
+      return nullptr;
+    }
+    return &iter->second;
+  }
+  // DenseTensor* mutable_rulebook() { return &rulebook_; }
+  void SetTable(const std::string& key,
+                const std::pair<DenseTensor, std::vector<int>>& table) {
+    auto ret = table_ptr_->insert({key, table});
+    if (ret.second == false) {
+      ret.first->second = table;
+    }
+  }
+
+  const std::shared_ptr<
+      std::map<std::string, std::pair<DenseTensor, std::vector<int>>>>&
+  GetTablePtr() const {
+    return table_ptr_;
+  }
+  void SetTablePtr(
+      const std::shared_ptr<
+          std::map<std::string, std::pair<DenseTensor, std::vector<int>>>>&
+          table_ptr) {
+    table_ptr_ = table_ptr;
+  }
+
+  // const bool subm() const { return subm_; }
+  // void SetSubm(const bool subm) { subm_ = subm; }
+
  private:
   // save the indices of non zero elements in original dense tensor
   DenseTensor non_zero_indices_;

@@ -165,6 +197,12 @@
   bool coalesced_ = false;
   // save the number of non zero elements in each batch
   DDim dims_;
+
+  // for sparse conv
+  std::shared_ptr<
+      std::map<std::string, std::pair<DenseTensor, std::vector<int>>>>
+      table_ptr_ = std::make_shared<
+          std::map<std::string, std::pair<DenseTensor, std::vector<int>>>>();
   /* --------------------------- */
   /* example: non zero element is scalar */
   /* --------------------------- */
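The new table_ptr_ member gives each SparseCooTensor a shared, per-key cache from a user-chosen string to a (rulebook, counter) pair; this is the plumbing that lets a submanifold conv reuse the rulebook built by an earlier layer with the same key instead of rebuilding it. A minimal sketch of the round-trip through the new accessors (hypothetical usage written for this review, not code from the PR; the key name "subm_conv1" is arbitrary):

#include "paddle/phi/core/sparse_coo_tensor.h"

// Sketch: cache a rulebook on a conv output, then look it up again later.
void CacheAndReuseRulebook(phi::SparseCooTensor* out,
                           const phi::DenseTensor& rulebook,
                           const std::vector<int>& counter) {
  // SetTable inserts the pair and overwrites any existing entry for the key.
  out->SetTable("subm_conv1", std::make_pair(rulebook, counter));

  // table() returns nullptr when the key is absent, so a caller can fall
  // back to building the rulebook from scratch.
  const auto* cached = out->table("subm_conv1");
  if (cached != nullptr) {
    const phi::DenseTensor& reused_rulebook = cached->first;
    const std::vector<int>& reused_counter = cached->second;
    (void)reused_rulebook;  // would feed the gather/GEMM/scatter pipeline
    (void)reused_counter;
  }
}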
121 changes: 109 additions & 12 deletions paddle/phi/kernels/funcs/sparse/scatter.cu.h

@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+
+#define VecBytes 16
 
 namespace phi {
 namespace funcs {

@@ -28,33 +33,125 @@ namespace sparse {
  * channels: the output channel size
  * out: the outputs
  **/
-template <typename T>
+template <typename T, int VecSize>
 __global__ void ScatterKernel(const T* input,
                               const int* unique_value,
                               const int* out_index,
                               const int non_zero_num,
                               const int rulebook_len,
                               const int channels,
-                              T* out,
-                              const bool subm = false) {
+                              T* out) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
-    int indices_i = i / channels;
-    int channels_i = i - indices_i * channels;
+  const int vec_channels = channels / VecSize;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  for (int i = tid; i < non_zero_num * vec_channels;
+       i += gridDim.x * blockDim.x) {
+    int indices_i = i / vec_channels;
+    int channels_i = i - indices_i * vec_channels;
 
     int start = unique_value[indices_i];
     int end = indices_i == non_zero_num - 1 ? rulebook_len
                                             : unique_value[indices_i + 1];
     // max(end-start) = kernel_size
-    T sum = static_cast<T>(0);
-    if (subm) {
-      sum = out[indices_i * channels + channels_i];
-    }
+    StoreT sums = {static_cast<T>(0)};
     for (int j = start; j < end; j++) {
       const int out_feature_i = out_index[j];
-      sum += input[out_feature_i * channels + channels_i];
+      LoadT vec_in;
+      phi::Load<T, VecSize>(
+          input + out_feature_i * channels + channels_i * VecSize, &vec_in);
+#pragma unroll
+      for (int k = 0; k < VecSize; k++) {
+        sums[k] += vec_in[k];
+      }
     }
-    out[indices_i * channels + channels_i] = sum;
+    phi::Store<T, VecSize>(sums,
+                           out + indices_i * channels + channels_i * VecSize);
   }
 }
+
+// The scatter indices have been grouped in advance:
+// index_counts records the size of each group and
+// index_groups stores the indices belonging to each group.
+template <typename T, int VecSize>
+__global__ void ScatterKernelV2(const T* input,
+                                const int* index_counts,
+                                const int* index_groups,
+                                const int non_zero_num,
+                                const int kernel_size,
+                                const int channels,
+                                const int buffer_counts,
+                                T* out) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int vec_channels = channels / VecSize;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  for (int i = tid; i < non_zero_num * vec_channels;
+       i += gridDim.x * blockDim.x) {
+    int indices_i = i / vec_channels;
+    int channels_i = i - indices_i * vec_channels;
+
+    StoreT sums = {static_cast<T>(0)};
+    phi::Load<T, VecSize>(out + indices_i * channels + channels_i * VecSize,
+                          &sums);
+    for (int it = 0; it < buffer_counts; it++) {
+      int len = index_counts[indices_i + it * non_zero_num];
+      const int group_offset = it * kernel_size * non_zero_num;
+      for (int j = 0; j < len; j++) {
+        const int out_feature_i =
+            index_groups[indices_i * kernel_size + j + group_offset];
+        LoadT vec_in;
+        phi::Load<T, VecSize>(
+            input + out_feature_i * channels + channels_i * VecSize, &vec_in);
+#pragma unroll
+        for (int k = 0; k < VecSize; k++) {
+          sums[k] += vec_in[k];
+        }
+      }
+    }
+    phi::Store<T, VecSize>(sums,
+                           out + indices_i * channels + channels_i * VecSize);
+  }
+}
+
+template <typename T>
+void ScatterV2(const GPUContext& dev_ctx,
+               const T* input,
+               const int* index_counts,
+               const int* index_groups,
+               const int non_zero_num,
+               const int kernel_size,
+               const int channels,
+               const int buffer_counts,
+               T* output) {
+  const int VecSize = VecBytes / sizeof(T);
+  if (channels % VecSize == 0) {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, non_zero_num * channels / VecSize, 1);
+    ScatterKernelV2<T, VecSize><<<config.block_per_grid.x,
+                                  config.thread_per_block.x,
+                                  0,
+                                  dev_ctx.stream()>>>(input,
+                                                      index_counts,
+                                                      index_groups,
+                                                      non_zero_num,
+                                                      kernel_size,
+                                                      channels,
+                                                      buffer_counts,
+                                                      output);
+  } else {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, non_zero_num * channels, 1);
+    ScatterKernelV2<T, 1><<<config.block_per_grid.x,
+                            config.thread_per_block.x,
+                            0,
+                            dev_ctx.stream()>>>(input,
+                                                index_counts,
+                                                index_groups,
+                                                non_zero_num,
+                                                kernel_size,
+                                                channels,
+                                                buffer_counts,
+                                                output);
+  }
+}
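Two details are worth spelling out. First, the vectorization: with VecBytes fixed at 16, VecSize = VecBytes / sizeof(T) yields 128-bit transactions (4 floats or 8 halves per load/store), and ScatterV2 falls back to the scalar VecSize = 1 instantiation whenever channels is not a multiple of the vector width. Second, the accumulation itself: the single-threaded C++ reference below restates what ScatterKernelV2 computes, minus the vectorization (a sketch written for this review, not code from the PR):

// Reference semantics of ScatterKernelV2 (sketch). For each output row i,
// each of the buffer_counts index groups contributes
// index_counts[i + it * non_zero_num] gathered input rows; `out` is read
// before accumulation, i.e. the kernel accumulates in place.
template <typename T>
void ScatterV2Reference(const T* input,
                        const int* index_counts,
                        const int* index_groups,
                        int non_zero_num,
                        int kernel_size,
                        int channels,
                        int buffer_counts,
                        T* out) {  // pre-initialized by the caller
  for (int i = 0; i < non_zero_num; i++) {
    for (int it = 0; it < buffer_counts; it++) {
      const int len = index_counts[i + it * non_zero_num];
      const int group_offset = it * kernel_size * non_zero_num;
      for (int j = 0; j < len; j++) {
        const int row = index_groups[i * kernel_size + j + group_offset];
        for (int c = 0; c < channels; c++) {
          out[i * channels + c] += input[row * channels + c];
        }
      }
    }
  }
}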
7 changes: 7 additions & 0 deletions paddle/phi/kernels/sparse/coalesced_kernel.h

@@ -26,5 +26,12 @@ void CoalescedKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      SparseCooTensor* out);
 
+template <typename T, typename Context>
+SparseCooTensor Coalesced(const Context& dev_ctx, const SparseCooTensor& x) {
+  SparseCooTensor coo;
+  CoalescedKernel<T, Context>(dev_ctx, x, &coo);
+  return coo;
+}
+
 }  // namespace sparse
 }  // namespace phi
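The Coalesced wrapper only allocates the result and forwards to CoalescedKernel. It matters in this PR because conv outputs are no longer coalesced by default (commit 2745b0e), so callers that need unique indices, such as unit tests comparing against dense results, now coalesce explicitly. A usage sketch, assuming the usual COO coalesce semantics of merging duplicate indices and summing their values:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/sparse/coalesced_kernel.h"

// Sketch: coalesce a COO tensor that may hold duplicate indices (e.g. a
// conv output) before comparing it against a reference result.
phi::SparseCooTensor MakeUnique(const phi::GPUContext& dev_ctx,
                                const phi::SparseCooTensor& x) {
  return phi::sparse::Coalesced<float>(dev_ctx, x);
}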
11 changes: 10 additions & 1 deletion paddle/phi/kernels/sparse/convolution_grad_kernel.h

@@ -26,13 +26,16 @@ template <typename T, typename Context>
 void Conv3dGradKernel(const Context& dev_ctx,
                       const SparseCooTensor& x,
                       const DenseTensor& kernel,
+                      const SparseCooTensor& out,
                       const DenseTensor& rulebook,
+                      const DenseTensor& counter,
                       const SparseCooTensor& out_grad,
                       const std::vector<int>& paddings,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
                       const int groups,
                       const bool subm,
+                      const std::string& key,
                       SparseCooTensor* x_grad,
                       DenseTensor* kernel_grad);
 
@@ -41,27 +44,33 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dGrad(
     const Context& dev_ctx,
     const SparseCooTensor& x,
     const DenseTensor& kernel,
+    const SparseCooTensor& out,
     const DenseTensor& rulebook,
+    const DenseTensor& counter,
     const SparseCooTensor& out_grad,
     const std::vector<int>& paddings,
     const std::vector<int>& dilations,
     const std::vector<int>& strides,
     const int groups,
-    const bool subm) {
+    const bool subm,
+    const std::string& key) {
   SparseCooTensor x_grad;
   DenseTensor kernel_grad;
 
   // TODO(zhangkaihuo): call InferMeta func here
   Conv3dGradKernel<T, Context>(dev_ctx,
                                x,
                                kernel,
+                               out,
                                rulebook,
+                               counter,
                                out_grad,
                                paddings,
                                dilations,
                                strides,
                                groups,
                                subm,
+                               key,
                                &x_grad,
                                &kernel_grad);
   return std::make_tuple(x_grad, kernel_grad);
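The grad entry points gain out, counter, and key so the backward kernel can locate the rulebook that forward cached on out under key instead of recomputing it. A call sketch for the widened wrapper (hypothetical surrounding code; the padding/dilation/stride values are placeholders):

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"

// Sketch: backward pass of a submanifold conv, reusing the forward rulebook.
std::tuple<phi::SparseCooTensor, phi::DenseTensor> BackwardSketch(
    const phi::GPUContext& dev_ctx,
    const phi::SparseCooTensor& x,
    const phi::DenseTensor& kernel,
    const phi::SparseCooTensor& out,
    const phi::DenseTensor& rulebook,
    const phi::DenseTensor& counter,
    const phi::SparseCooTensor& out_grad) {
  return phi::sparse::Conv3dGrad<float>(dev_ctx, x, kernel, out, rulebook,
                                        counter, out_grad,
                                        /*paddings=*/{1, 1, 1},
                                        /*dilations=*/{1, 1, 1},
                                        /*strides=*/{1, 1, 1},
                                        /*groups=*/1,
                                        /*subm=*/true,
                                        /*key=*/"subm_conv1");
}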
12 changes: 9 additions & 3 deletions paddle/phi/kernels/sparse/convolution_kernel.h

@@ -31,8 +31,10 @@ void Conv3dKernel(const Context& dev_ctx,
                   const std::vector<int>& strides,
                   const int groups,
                   const bool subm,
+                  const std::string& key,
                   SparseCooTensor* out,
-                  DenseTensor* rulebook);
+                  DenseTensor* rulebook,
+                  DenseTensor* counter);
 
 template <typename T, typename Context>
 SparseCooTensor Conv3d(const Context& dev_ctx,

@@ -43,7 +45,9 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
                        const std::vector<int>& strides,
                        const int groups,
                        const bool subm,
-                       DenseTensor* rulebook) {
+                       const std::string& key,
+                       DenseTensor* rulebook,
+                       DenseTensor* counter) {
   SparseCooTensor coo;
   Conv3dKernel<T, Context>(dev_ctx,
                            x,

@@ -53,8 +57,10 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
                            strides,
                            groups,
                            subm,
+                           key,
                            &coo,
-                           rulebook);
+                           rulebook,
+                           counter);
   return coo;
 }
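And the matching forward call, again a hypothetical sketch: rulebook and counter come back through plain output parameters, and, per the SetTable plumbing added in sparse_coo_tensor.h above, the same pair is presumably also cached on the output under key for the backward pass and for later layers sharing that key.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/sparse/convolution_kernel.h"

// Sketch: forward 3D submanifold convolution keeping rulebook and counter.
phi::SparseCooTensor ForwardSketch(const phi::GPUContext& dev_ctx,
                                   const phi::SparseCooTensor& x,
                                   const phi::DenseTensor& kernel) {
  phi::DenseTensor rulebook;  // filled by the kernel
  phi::DenseTensor counter;   // filled by the kernel
  return phi::sparse::Conv3d<float>(dev_ctx, x, kernel,
                                    /*paddings=*/{1, 1, 1},
                                    /*dilations=*/{1, 1, 1},
                                    /*strides=*/{1, 1, 1},
                                    /*groups=*/1,
                                    /*subm=*/true,
                                    /*key=*/"subm_conv1",
                                    &rulebook, &counter);
}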