From bb4db9b1d883d374a330699a85fef61b23539ff5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 9 May 2022 07:52:18 +0000 Subject: [PATCH 01/70] test sparse model --- .../unittests/test_sparse_middle_extractor.py | 324 ++++++++++++++++++ .../tests/unittests/test_sparse_mnist.py | 126 +++++++ 2 files changed, 450 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py new file mode 100644 index 0000000000000..ae52b4a413336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py @@ -0,0 +1,324 @@ +import paddle +import paddle.nn as nn +import paddle.sparse as sparse +from paddle.fluid.framework import _test_eager_guard +import time +import numpy as np +import torch +import spconv.pytorch as spconv +import inspect + +class MiddleExtractor(paddle.nn.Layer): + def __init__(self, + #output_shape, + use_norm=True, + num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='MiddleExtractor'): + super(MiddleExtractor, self).__init__() + self.name = name + if not use_norm: + self.middle_conv = paddle.nn.Sequential( + #nn.Pad3D(1), + nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D([1, 1, 1, 1, 0, 0]), + nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D(1), + nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + ) + else: + self.middle_conv = paddle.nn.Sequential( + #nn.Pad3D(1), + nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D([1, 1, 1, 1, 0, 0]), + nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D(1), + nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + ) + def forward(self, x): + return self.middle_conv(x) + + +def get_pos_to_kw_map(func): + pos_to_kw = {} + fsig = inspect.signature(func) + pos = 0 + for name, info in fsig.parameters.items(): + if info.kind is info.POSITIONAL_OR_KEYWORD: + pos_to_kw[pos] = name + pos += 1 + return pos_to_kw + +def change_default_args(**kwargs): + def layer_wrapper(layer_class): + class DefaultArgLayer(layer_class): + def __init__(self, *args, **kw): + pos_to_kw = get_pos_to_kw_map(layer_class.__init__) + kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} + for key, val in kwargs.items(): + if key not in kw and kw_to_pos[key] > len(args): + kw[key] = val + super().__init__(*args, **kw) + + return DefaultArgLayer + + return layer_wrapper + +class Empty(torch.nn.Module): + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + +class SpconvMiddleExtractor(torch.nn.Module): + def __init__(self, + #output_shape, + use_norm=True, + 
num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='SpconvMiddleExtractor'): + super(SpconvMiddleExtractor, self).__init__() + if use_norm: + BatchNorm1d = change_default_args( + eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) + Linear = change_default_args(bias=False)(nn.Linear) + else: + BatchNorm1d = Empty + Linear = change_default_args(bias=True)(nn.Linear) + + middle_layers = [] + + num_filters = [num_input_features] + num_filters_down1 + filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + + for i, o in filters_pairs_d1: + middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) + if use_norm: + #middle_layers.append(BatchNorm1d(o)) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + + middle_layers.append( + spconv.SparseConv3d( + num_filters[-1], + num_filters[-1], (3, 1, 1), (2, 1, 1), + bias=False)) + + if use_norm: + #middle_layers.append( + # BatchNorm1d(num_filters[-1])) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + + + # assert len(num_filters_down2) > 0 + if len(num_filters_down1) == 0: + num_filters = [num_filters[-1]] + num_filters_down2 + else: + num_filters = [num_filters_down1[-1]] + num_filters_down2 + filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + for i, o in filters_pairs_d2: + middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) + if use_norm: + #middle_layers.append(BatchNorm1d(o)) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + middle_layers.append( + spconv.SparseConv3d( + num_filters[-1], + num_filters[-1], (3, 1, 1), (2, 1, 1), + bias=False)) + if use_norm: + #middle_layers.append( + #BatchNorm1d(num_filters[-1])) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) + middle_layers.append(spconv.ToDense()) + self.middle_conv = spconv.SparseSequential(*middle_layers) + + def forward(self, x): + out = self.middle_conv(x) + return out + +class SparseMiddleExtractor(paddle.nn.Layer): + def __init__(self, + #output_shape, + use_norm=True, + num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='SparseMiddleExtractor'): + super(SparseMiddleExtractor, self).__init__() + self.name = name + + middle_layers = [] + num_filters = [num_input_features] + num_filters_down1 + filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] + for i, o in filters_pairs_d1: + middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) + + if use_norm: + middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + + if len(num_filters_down1) == 0: + num_filters = [num_filters[-1]] + num_filters_down2 + else: + num_filters = [num_filters_down1[-1]] + num_filters_down2 + + filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] + + for i, o in filters_pairs_d2: + middle_layers.append(sparse.SubmConv3D(i, o, 3, 
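# Descriptive note (editorial comment, not part of the patch): SubmConv3D mirrors spconv.SubMConv3d
# in SpconvMiddleExtractor above -- a submanifold sparse convolution that keeps the input's sparsity
# pattern -- while the strided sparse.Conv3D layers in this module downsample it.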
bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + self.middle_conv = nn.Sequential(*middle_layers) + + def forward(self, x): + sparse_out = self.middle_conv(x) + #return sparse_out + return sparse_out.to_dense() + + +def test(): + paddle.seed(0) + with _test_eager_guard(): + in_channels = 128 + # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. + shape = [20, 40, 100] + batch_size = 1 + sparsity = 0.95 + + full_shape = [batch_size] + shape + [in_channels] + print(full_shape) + + total_elements = np.prod(shape) + nnz = int(total_elements * (1-sparsity)) + print("nnz=", nnz) + + #product indices + indices = [] + for i in range(4): + indices.append(paddle.randint(0, full_shape[i], [1, nnz])) + + indices = paddle.concat(indices) + #product values + values = paddle.randn((nnz, in_channels)) + + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) + + dense_x = sparse_x.to_dense() + + #spconv + device = torch.device("cuda") + torch_x = torch.tensor(dense_x.numpy(), device=device) + + spconv_x = spconv.SparseConvTensor.from_dense(torch_x) + + #whether to use batch_norm + use_norm = True + + dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) + spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) + sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) + layer_nums = len(sparse_model.middle_conv) + block_size = 3 if use_norm else 2 + layer_nums = int(layer_nums / block_size) + + for i in range(0, layer_nums): + weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) + sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) + if use_norm: + bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) + sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) + + print(dense_model) + print(sparse_model) + print(spconv_model) + paddle.device.cuda.synchronize() + + #warm up + dense_x.stop_gradient=True + out1 = dense_model(dense_x) + paddle.device.cuda.synchronize() + sparse_x.stop_gradient=True + out2 = sparse_model(sparse_x) + paddle.device.cuda.synchronize() + spconv_x.features.required_grad=False + out3 = spconv_model(spconv_x) + torch.cuda.synchronize(device) + #warm up + + t0 = time.time() + #padde dense + dense_x.stop_gradient=False + out1 = dense_model(dense_x) + out1.backward(out1) + paddle.device.cuda.synchronize() + t1 = time.time() + + #padde sparse + sparse_x.stop_gradient=False + out2 = sparse_model(sparse_x) + out2.backward(out2) + paddle.device.cuda.synchronize() + t2 = time.time() + + #spconv + spconv_x.features.required_grad=True + spconv_x.features.requires_grad_() + out3 = spconv_model(spconv_x) + out3.backward(out3) + torch.cuda.synchronize(device) + t3 = time.time() + + # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 + # Note 3. 
Running only the forward pass, the sparse time is close to spconv's, and the higher the sparsity the better sparse performs; with the current forward+backward test, spconv's time is very high and the reason is unknown + print("dense time: ", t1 - t0) + print("sparse time: ", t2 - t1) + print("spconv time: ", t3 - t2) + + # Note 4. paddle's and torch's BN results differ slightly: for a random input of shape=(4000, 64), a single BN layer shows a forward error around 1e-6 and a backward error around 1e-4 + #verify the forward calculation result + assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) + + #verify the backward calculation result + assert np.allclose(spconv_x.features.grad.cpu().numpy(), + sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) + +test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py new file mode 100644 index 0000000000000..3589dc83090f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py @@ -0,0 +1,126 @@ +import paddle +from paddle.vision.transforms import Compose, Normalize, ToTensor +from paddle.fluid.framework import _test_eager_guard +import time + +paddle.disable_static() +#transform = Compose([Normalize(mean=[127.5], +# std=[127.5], +# data_format='CHW')]) +transform = Compose([ToTensor()]) +# use transform to normalize the dataset +print('download training data and load training data') +train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) +test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) +print('load finished') + +import numpy as np +#import matplotlib.pyplot as plt +train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] +train_data0 = train_data0.reshape([28,28]) +#plt.figure(figsize=(2,2)) +#plt.imshow(train_data0, cmap=plt.cm.binary) +print('train_data0 label is: ' + str(train_label_0)) + + +import paddle +import paddle.nn.functional as F +class SparseLeNet(paddle.nn.Layer): + def __init__(self): + super(SparseLeNet, self).__init__() + #self.bn = paddle.sparse.BatchNorm(1) + self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) + self.relu1 = paddle.sparse.ReLU() + self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) + self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) + self.relu2 = paddle.sparse.ReLU() + self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) + + self.fc1 = paddle.nn.Linear(16*5*5, 120) + self.fc2 = paddle.nn.Linear(120, 84) + self.fc3 = paddle.nn.Linear(84, 10) + + def forward(self, x): + #x = self.bn(x) + x = self.conv1(x) + x = self.relu1(x) + x = self.pool1(x) + x = self.conv2(x) + x = self.relu2(x) + x = self.pool2(x) + x = x.to_dense() + + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self.fc1(x) + x = paddle.nn.functional.relu(x) + x = self.fc2(x) + x = paddle.nn.functional.relu(x) + x = self.fc3(x) + return x + +import paddle.nn.functional as F +train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) +# load the training set with batch_size set to 64 +# sparse training + +def prepare_data(x_data): + x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) + x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) + return x_data + +def sparse_train(model): + model.train() + epochs = 2 + optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) + # use Adam as the optimizer + for epoch in range(epochs): + for batch_id, data in enumerate(train_loader()): + x_data = data[0] + y_data = data[1] + x_data = prepare_data(x_data) +
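# Descriptive note (editorial comment, not part of the patch): prepare_data above reshapes the
# dense NCHW MNIST batch to NDHWC with a depth of 1; to_sparse_coo(4) below keeps the first four
# dims (N, D, H, W) as sparse coordinates and leaves the trailing channel dim dense, matching the
# layout consumed by the sparse Conv3D/MaxPool3D layers in SparseLeNet.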
x_data = x_data.to_sparse_coo(4) + x_data.stop_gradient=False + predicts = model(x_data) + loss = F.cross_entropy(predicts, y_data) + # 计算损失 + acc = paddle.metric.accuracy(predicts, y_data) + loss.backward() + if batch_id % 300 == 0: + print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) + optim.step() + optim.clear_grad() + +test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) +# 加载测试数据集 +def test(model): + model.eval() + batch_size = 64 + for batch_id, data in enumerate(test_loader()): + x_data = data[0] + y_data = data[1] + x_data = prepare_data(x_data) + x_data = x_data.to_sparse_coo(4) + predicts = model(x_data) + # 获取预测结果 + loss = F.cross_entropy(predicts, y_data) + acc = paddle.metric.accuracy(predicts, y_data) + if batch_id % 20 == 0: + print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) + +with _test_eager_guard(): + sparse_model = SparseLeNet() + print(sparse_model) + + t0 = time.time() + sparse_train(sparse_model) + t1 = time.time() + print("spare time:", t1-t0) + test(sparse_model) + #x = paddle.randn((1, 1,28,28,1)) + #x.stop_gradient=False + #sparse_x = x.to_sparse_coo(4) + #print("sparse_x values shape:", sparse_x.values().shape) + #out = sparse_model(sparse_x) + #out.backward(out) + #print("end") + From 441da36cb4080172c37acb547a73cc580344cbd3 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 29 May 2022 18:00:48 +0800 Subject: [PATCH 02/70] refactor code structure --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 182 ++++++++++---------- 1 file changed, 89 insertions(+), 93 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035..e572ec70dbebe 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -451,10 +451,80 @@ void BatchNormKernel(const Context &ctx, paddle::framework::TensorCopy(x, ctx.GetPlace(), y); } else { double this_factor = 1. 
- momentum; - - bool called = false; +#ifdef PADDLE_WITH_HIP + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); +#else #if CUDNN_VERSION_MIN(7, 4, 1) - called = true; size_t workspace_size = 0; size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; @@ -530,102 +600,28 @@ void BatchNormKernel(const Context &ctx, workspace_size, reserve_space_ptr, reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, - block, - DataLayout::kNCHW><<>>( +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, scale.template data>(), bias.template data>(), - C, - N, - H * W * D, - epsilon, this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, - block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), epsilon, - this_factor, - 
transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, - mode_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, - transformed_x.template data(), - data_desc_, - ctx.template Alloc(&transformed_y), - bn_param_desc_, - scale.template data>(), - bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), - epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance))); + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance))); +#endif // CUDNN_VERSION_MIN(7, 4, 1) #endif - } } } From c48e076349ba257bcd87f874f792aaf449852fd8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 29 May 2022 22:55:48 +0800 Subject: [PATCH 03/70] add native kernel usage --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 222 ++++++++++++-------- 1 file changed, 134 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index e572ec70dbebe..08eea1f8717cd 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -524,103 +524,149 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else -#if CUDNN_VERSION_MIN(7, 4, 1) - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - DenseTensor workspace_tensor; - DenseTensor reserve_space_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. 
- // auto *reserve_space = ctx.Output("ReserveSpace"); - if (reserve_space == nullptr) { - reserve_space = &reserve_space_tensor; - } - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - phi::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space->Resize({static_cast(reserve_space_size)}); - reserve_space_ptr = - static_cast(ctx.template Alloc(reserve_space)); - workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = - static_cast(ctx.template Alloc(&workspace_tensor)); - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, - mode_, - CUDNN_BATCHNORM_OPS_BN, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, + const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + if(use_native_kernel) { + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + block, + DataLayout::kNCHW><<>>( transformed_x.template data(), - nullptr, - nullptr, - data_desc_, - transformed_y.template data(), - bn_param_desc_, scale.template data>(), bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), + C, + N, + H * W * D, epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance), - nullptr, - workspace_ptr, - workspace_size, - reserve_space_ptr, - reserve_space_size)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, - mode_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( transformed_x.template data(), - data_desc_, - ctx.template Alloc(&transformed_y), - bn_param_desc_, scale.template data>(), bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), + C, + N, + H * W * D, epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance))); + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + } else { +#if CUDNN_VERSION_MIN(7, 4, 1) + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; 
+ DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + // auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*activationDesc=*/nullptr, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space->Resize({static_cast(reserve_space_size)}); + reserve_space_ptr = + static_cast(ctx.template Alloc(reserve_space)); + workspace_tensor.Resize({static_cast(workspace_size)}); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + CUDNN_BATCHNORM_OPS_BN, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + nullptr, + nullptr, + data_desc_, + transformed_y.template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), + epsilon, + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance), + nullptr, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), + epsilon, + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance))); #endif // CUDNN_VERSION_MIN(7, 4, 1) + } #endif } } From 0a68ba3641939219cc22c45a5ca772e419107ecf Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 00:21:54 +0800 Subject: [PATCH 04/70] add wellford impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 268 +++++++++++++++++++- 1 file changed, 264 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 08eea1f8717cd..15a84c4ae918e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -140,6 +140,265 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( + const T *x, + const BatchNormParamType 
*scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + constexpr int THREADS_PER_WARP = 32; + constexpr int THREADS_BITS_PER_WARP = 5; + + constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; + const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); + + __shared__ int warp_shared_count[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType local_mean = static_cast>(0); + BatchNormParamType local_var_n = static_cast>(0); + int local_count = 0; + + // thread-local iterative computation + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + BatchNormParamType delta = (x_i - local_mean); + local_count++; + local_mean += delta / local_count; + local_var_n += delta * (x_i - local_mean); + } + + // warp sum + for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x % THREADS_PER_WARP == 0) { + warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; + warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; + warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; + } + __syncthreads(); + + // block sum + if (threadIdx.x < WARP_PER_BLOCK) { + local_count = warp_shared_count[threadIdx.x]; + local_mean = warp_shared_count[threadIdx.x]; + local_var_n = warp_shared_count[threadIdx.x]; + } + + for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x == 0) { + mean_val = local_mean; + variance_val = local_var_n / local_count; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) 
* variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + constexpr int PARALLEL_LOADS = 4; + + constexpr int THREADS_PER_WARP = 32; + constexpr int THREADS_BITS_PER_WARP = 5; + + constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; + const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); + + __shared__ int warp_shared_count[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType tmp_local_mean[PARALLEL_LOADS]; + BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; + int tmp_local_count[PARALLEL_LOADS]; + + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + tmp_local_mean[k] = static_cast>(0); + tmp_local_var_n[k] = static_cast>(0); + tmp_local_count[k] = 0; + } + + // thread-local iterative computation + for (int j = threadIdx.x; j < inner_size; j += PARALLEL_LOADS * blockDim.x) { + BatchNormParamType tmp_local_x[PARALLEL_LOADS]; + BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; + BatchNormParamType valid[PARALLEL_LOADS]; + auto offset = j; + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + if(offset < inner_size) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(offset / HxW * C + i) * HxW + offset % HxW + : offset * outer_size + i; + tmp_local_x[k] = static_cast>(x[index]); + tmp_local_count[k]++; + tmp_local_count_inv[k] = static_cast>(1) / tmp_local_count[k]; + valid[k] = static_cast>(1); + } else { + tmp_local_x[k] = static_cast>(0); + tmp_local_count_inv[k] = static_cast>(0); + valid[k] = static_cast>(0); + } + offset += blockDim.x; + } + + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); + tmp_local_mean[k] += delta * tmp_local_count_inv[k]; + tmp_local_var_n[k] += delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; + } + } + + #pragma unroll + for(int k = 1; k < PARALLEL_LOADS; k++) { + BatchNormParamType factor = 1.0 / static_cast(max(1, tmp_local_count[0]+tmp_local_count[k])); + BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); + tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + tmp_local_count[k] * tmp_local_mean[k]) * factor; + tmp_local_var_n[0] += (tmp_local_var_n[k] + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); + tmp_local_count[0] += tmp_local_count[k]; + } + + BatchNormParamType local_mean = tmp_local_mean[0]; + BatchNormParamType local_var_n = tmp_local_var_n[0]; + int local_count = tmp_local_count[0]; + + // warp sum + for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x % THREADS_PER_WARP == 0) { + warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; + warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; + warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; + } + __syncthreads(); + + // block sum + if (threadIdx.x < WARP_PER_BLOCK) { + local_count = warp_shared_count[threadIdx.x]; + local_mean = warp_shared_count[threadIdx.x]; + local_var_n = warp_shared_count[threadIdx.x]; + } + + for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x == 0) { + mean_val = local_mean; + variance_val = local_var_n / local_count; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = 
layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -524,15 +783,16 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + const bool use_native_kernel = true; if(use_native_kernel) { const int num = transformed_x.numel(); - const int block = 256; + const int block = 1024; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< + BNForwardTrainingWellfordParallel< T, block, DataLayout::kNCHW><<>>( @@ -550,7 +810,7 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTraining< + BNForwardTrainingWellfordParallel< T, block, DataLayout::kNHWC><<>>( From b3248c9ef649ac2073853ae0b39f8b4fa3175d1b Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 15:32:25 +0800 Subject: [PATCH 05/70] add shmem impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 83 +++++++++++++++++++-- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 15a84c4ae918e..394b6399977dd 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -399,6 +399,77 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } } + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + BatchNormParamType* x_buf = reinterpret_cast*>(smem_buf); + + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_buf[j] = x_i; + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x_buf[j]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -786,16 +857,16 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { - const int num = transformed_x.numel(); - const int block = 1024; + const int block = 512; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); + const size_t smem_size = N * H * W * D * sizeof(BatchNormParamType); if (compute_format == DataLayout::kNCHW) { - BNForwardTrainingWellfordParallel< + BNForwardTrainingSMem< T, block, - DataLayout::kNCHW><<>>( + DataLayout::kNCHW><<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -810,10 +881,10 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTrainingWellfordParallel< + BNForwardTrainingSMem< T, block, - DataLayout::kNHWC><<>>( + DataLayout::kNHWC><<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), From 78349a2d57bc34fc0897bacd9f42cc2fa9a918ee Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 17:07:29 +0800 Subject: [PATCH 06/70] add dispatch logic --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 92 ++++++++++++++++++--- 1 file changed, 79 insertions(+), 13 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 394b6399977dd..ab797aa186f65 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -470,6 +470,81 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( } } +template +inline bool TryDispatchBNForwardTrainingSMem( + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + constexpr int block_size = 512; + const size_t smem = N * HxW * sizeof(BatchNormParamType); + int max_active_blocks_conf; + { + 
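// Descriptive note (editorial comment, not part of the patch): the occupancy query below checks
// whether a 512-thread block that caches the whole N*HxW column in dynamic shared memory can still
// be resident on an SM; a result of zero makes this helper return false, and the caller then falls
// back to the plain BNForwardTraining kernel.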
cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf, + BNForwardTrainingSMem, + block_size, smem); + } + if (max_active_blocks_conf <= 0) { + return false; + } + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block_size, 1); + const int grid = std::min(C, max_blocks); + BNForwardTrainingSMem<<>>( + x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + return true; +} + +template +inline void DispatchBNForwardTraining( + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + if ((N * HxW) <= 1024) { + // TODO: impl register-cache version + return; + } else { + bool dispatch_smem_impl_success = false; + { + dispatch_smem_impl_success = TryDispatchBNForwardTrainingSMem( + ctx, x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + } + if (!dispatch_smem_impl_success) { + const int block = 512; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + return BNForwardTraining<<>>( + x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -857,16 +932,9 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - const size_t smem_size = N * H * W * D * sizeof(BatchNormParamType); if (compute_format == DataLayout::kNCHW) { - BNForwardTrainingSMem< - T, - block, - DataLayout::kNCHW><<>>( + DispatchBNForwardTraining( + ctx, transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -881,10 +949,8 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTrainingSMem< - T, - block, - DataLayout::kNHWC><<>>( + DispatchBNForwardTraining( + ctx, transformed_x.template data(), scale.template data>(), bias.template data>(), From 98c66f0df7b2ab0c33c89a0bd2a18fce2b13440a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 5 Jun 2022 00:21:12 +0800 Subject: [PATCH 07/70] add channel_last impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 280 +++++++++++++++++--- 1 file changed, 250 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index ab797aa186f65..14778a89a4657 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -25,6 +25,7 @@ namespace cub = hipcub; #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/fluid/operators/norm_utils.cu.h" #include 
"paddle/fluid/operators/norm_utils.h" @@ -399,6 +400,158 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } } +template +static __global__ void BNForwardTraining2D( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + + BatchNormParamType* mean_val = reinterpret_cast*>(smem_buf); + BatchNormParamType* variance_val = reinterpret_cast*>(&smem_buf[blockDim.x]); + BatchNormParamType* inv_var_val = reinterpret_cast*>(&smem_buf[2*blockDim.x]); + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; + #pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + if (gridDim.y > 1) { + volatile BatchNormParamType* staging_sum = block_data_ptr; + volatile BatchNormParamType* staging_square_sum = &block_data_ptr[C*gridDim.y]; + // write block data to global memory + if (threadIdx.y == 0) { + staging_sum[i + blockIdx.y * C] = x_sum; + staging_square_sum[i + blockIdx.y * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + x_sum += staging_sum[i+y*C]; + x_square_sum += staging_square_sum[i+y*C]; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; + #pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + // final compute + if(threadIdx.y == 0) { + 
mean_val[threadIdx.x] = x_sum / inner_size; + variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; + inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val[threadIdx.x]; + save_inv_variance[i] = inv_var_val[threadIdx.x]; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + + exponentialAverageFactor * variance[i]; + } + } + } else { + if(blockIdx.y == 0 && threadIdx.y == 0) { + mean_val[threadIdx.x] = x_sum / inner_size; + variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; + inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val[threadIdx.x]; + save_inv_variance[i] = inv_var_val[threadIdx.x]; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + + exponentialAverageFactor * variance[i]; + } + } + __syncthreads(); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val[threadIdx.x]; + y[index] = scale[i] * x_sub_mean * inv_var_val[threadIdx.x] + bias[i]; + } + } +} template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( @@ -932,38 +1085,105 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { + dim3 block; + dim3 grid; + + const int block_size = 512; + // init block&grid config + int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + BatchNormParamType* block_data_ptr = nullptr; + int* flag_ptr = nullptr; + if(grid.y > 1) { + block_data_tensor.Resize({static_cast(2 * C * grid.y * sizeof(BatchNormParamType))}); + flag_tensor.Resize({static_cast(grid.x * sizeof(int))}); + + block_data_ptr = static_cast*>(ctx.template Alloc>(&block_data_tensor)); + flag_ptr = static_cast(ctx.template Alloc(&flag_tensor)); + } + + size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; if (compute_format == DataLayout::kNCHW) { - DispatchBNForwardTraining( - ctx, - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); + BNForwardTraining2D + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H 
* W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + block_data_ptr, + flag_ptr); + // DispatchBNForwardTraining( + // ctx, + // transformed_x.template data(), + // scale.template data>(), + // bias.template data>(), + // C, + // N, + // H * W * D, + // epsilon, + // this_factor, + // transformed_y.template data(), + // mean_out->template data>(), + // variance_out->template data>(), + // saved_mean->template data>(), + // saved_variance->template data>()); } else { - DispatchBNForwardTraining( - ctx, - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); + BNForwardTraining2D + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + block_data_ptr, + flag_ptr); + + // DispatchBNForwardTraining( + // ctx, + // transformed_x.template data(), + // scale.template data>(), + // bias.template data>(), + // C, + // N, + // H * W * D, + // epsilon, + // this_factor, + // transformed_y.template data(), + // mean_out->template data>(), + // variance_out->template data>(), + // saved_mean->template data>(), + // saved_variance->template data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) From 570dc551bb3958269b6ea7d65309d57a6e2c20ab Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Mon, 6 Jun 2022 10:13:28 +0800 Subject: [PATCH 08/70] refine the global space init --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 14778a89a4657..0cfffbb81f1ba 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1109,11 +1109,13 @@ void BatchNormKernel(const Context &ctx, BatchNormParamType* block_data_ptr = nullptr; int* flag_ptr = nullptr; if(grid.y > 1) { - block_data_tensor.Resize({static_cast(2 * C * grid.y * sizeof(BatchNormParamType))}); - flag_tensor.Resize({static_cast(grid.x * sizeof(int))}); + block_data_tensor = phi::Empty, Context>(ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); - block_data_ptr = static_cast*>(ctx.template Alloc>(&block_data_tensor)); - flag_ptr = static_cast(ctx.template Alloc(&flag_tensor)); + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); } size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; From aaca04a6bc9f8a671488b8f2fd99a95c6303fb34 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Tue, 7 Jun 2022 14:49:24 +0800 Subject: [PATCH 09/70] impl 2d kernel --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 549 ++++++++++++-------- 1 file changed, 339 insertions(+), 210 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 0cfffbb81f1ba..c726f31de232d 100644 --- 
a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -141,7 +141,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } - template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( const T *x, @@ -181,8 +180,8 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( // thread-local iterative computation for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; BatchNormParamType x_i = static_cast>(x[index]); BatchNormParamType delta = (x_i - local_mean); local_count++; @@ -191,11 +190,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( } // warp sum - for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -214,11 +219,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( local_var_n = warp_shared_count[threadIdx.x]; } - for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -251,20 +262,21 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( } template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double 
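[Note] The two butterfly loops reformatted above merge per-thread Welford states (count, mean, M2) first across a warp and then across warps. A self-contained device-side sketch of the warp-level step, assuming a full 32-lane warp; the function name and float precision are illustrative, not the kernel's actual types.

    // Each lane enters with its own (count, mean, m2); after the loop every lane
    // holds the statistics of the whole warp (pairwise Chan et al. combination).
    __device__ inline void WarpMergeWelford(int &count, float &mean, float &m2) {
      const unsigned kFullMask = 0xffffffffu;
    #pragma unroll
      for (int b = 0; b < 5; ++b) {                     // 5 = log2(32 lanes)
        float o_mean = __shfl_xor_sync(kFullMask, mean, 1 << b, 32);
        float o_m2   = __shfl_xor_sync(kFullMask, m2,   1 << b, 32);
        int   o_cnt  = __shfl_xor_sync(kFullMask, count, 1 << b, 32);
        float factor = 1.0f / static_cast<float>(max(1, count + o_cnt));
        float delta  = mean - o_mean;
        m2    += o_m2 + delta * delta * count * o_cnt * factor;
        mean   = (count * mean + o_cnt * o_mean) * factor;
        count += o_cnt;
      }
    }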
exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { +static __global__ + LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { int outer_size = C; int inner_size = N * HxW; __shared__ BatchNormParamType mean_val; @@ -288,28 +300,30 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; int tmp_local_count[PARALLEL_LOADS]; - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { tmp_local_mean[k] = static_cast>(0); tmp_local_var_n[k] = static_cast>(0); tmp_local_count[k] = 0; } // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; j += PARALLEL_LOADS * blockDim.x) { + for (int j = threadIdx.x; j < inner_size; + j += PARALLEL_LOADS * blockDim.x) { BatchNormParamType tmp_local_x[PARALLEL_LOADS]; BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; BatchNormParamType valid[PARALLEL_LOADS]; auto offset = j; - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { - if(offset < inner_size) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { + if (offset < inner_size) { const int index = layout == phi::DataLayout::kNCHW - ? (offset / HxW * C + i) * HxW + offset % HxW - : offset * outer_size + i; + ? (offset / HxW * C + i) * HxW + offset % HxW + : offset * outer_size + i; tmp_local_x[k] = static_cast>(x[index]); tmp_local_count[k]++; - tmp_local_count_inv[k] = static_cast>(1) / tmp_local_count[k]; + tmp_local_count_inv[k] = + static_cast>(1) / tmp_local_count[k]; valid[k] = static_cast>(1); } else { tmp_local_x[k] = static_cast>(0); @@ -319,20 +333,27 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel offset += blockDim.x; } - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); tmp_local_mean[k] += delta * tmp_local_count_inv[k]; - tmp_local_var_n[k] += delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; + tmp_local_var_n[k] += + delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; } } - #pragma unroll - for(int k = 1; k < PARALLEL_LOADS; k++) { - BatchNormParamType factor = 1.0 / static_cast(max(1, tmp_local_count[0]+tmp_local_count[k])); +#pragma unroll + for (int k = 1; k < PARALLEL_LOADS; k++) { + BatchNormParamType factor = + 1.0 / + static_cast(max(1, tmp_local_count[0] + tmp_local_count[k])); BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); - tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + tmp_local_count[k] * tmp_local_mean[k]) * factor; - tmp_local_var_n[0] += (tmp_local_var_n[k] + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); + tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + + tmp_local_count[k] * tmp_local_mean[k]) * + factor; + tmp_local_var_n[0] += + (tmp_local_var_n[k] + + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); tmp_local_count[0] += tmp_local_count[k]; } @@ -341,11 +362,17 @@ 
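[Note] For reference, the combination applied both to the PARALLEL_LOADS accumulators above and to the warp/block reduction is the standard pairwise update for partial statistics (n, \mu, M_2), written here in LaTeX:

    n_{ab} = n_a + n_b, \qquad \delta = \mu_b - \mu_a,
    \mu_{ab} = \frac{n_a \mu_a + n_b \mu_b}{n_{ab}}, \qquad
    M_{2,ab} = M_{2,a} + M_{2,b} + \delta^2 \frac{n_a n_b}{n_{ab}},
    \sigma^2 = \frac{M_2}{n}.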
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel int local_count = tmp_local_count[0]; // warp sum - for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -364,11 +391,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel local_var_n = warp_shared_count[threadIdx.x]; } - for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -401,7 +434,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } template -static __global__ void BNForwardTraining2D( +static __global__ void BNForwardTraining2DComputeStatistic( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -411,32 +444,30 @@ static __global__ void BNForwardTraining2D( const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, BatchNormParamType *save_mean, BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, BatchNormParamType *block_data_ptr, int *flag_ptr) { int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(double)) char smem_buf[]; - - BatchNormParamType* mean_val = reinterpret_cast*>(smem_buf); - BatchNormParamType* variance_val = reinterpret_cast*>(&smem_buf[blockDim.x]); - BatchNormParamType* inv_var_val = reinterpret_cast*>(&smem_buf[2*blockDim.x]); - 
__shared__ BatchNormParamType smem_sum[BlockDim]; __shared__ BatchNormParamType smem_square_sum[BlockDim]; int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { BatchNormParamType x_sum = static_cast>(0); BatchNormParamType x_square_sum = static_cast>(0); - for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { const int index = layout == phi::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW : j * outer_size + i; @@ -447,9 +478,9 @@ static __global__ void BNForwardTraining2D( // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; - #pragma unroll - for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset*2) { +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; smem_square_sum[tid] = x_square_sum; } @@ -462,8 +493,9 @@ static __global__ void BNForwardTraining2D( } if (gridDim.y > 1) { - volatile BatchNormParamType* staging_sum = block_data_ptr; - volatile BatchNormParamType* staging_square_sum = &block_data_ptr[C*gridDim.y]; + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.y]; // write block data to global memory if (threadIdx.y == 0) { staging_sum[i + blockIdx.y * C] = x_sum; @@ -478,7 +510,7 @@ static __global__ void BNForwardTraining2D( // mark block done if (threadIdx.x == 0 && threadIdx.y == 0) { int old = atomicAdd(&flag_ptr[blockIdx.x], 1); - is_last_block_done = (old == (gridDim.y-1)); + is_last_block_done = (old == (gridDim.y - 1)); } __syncthreads(); @@ -488,15 +520,15 @@ static __global__ void BNForwardTraining2D( x_square_sum = static_cast>(0); // thread sum for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { - x_sum += staging_sum[i+y*C]; - x_square_sum += staging_square_sum[i+y*C]; + x_sum += staging_sum[i + y * C]; + x_square_sum += staging_square_sum[i + y * C]; } // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; - #pragma unroll - for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset*2) { +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; smem_square_sum[tid] = x_square_sum; } @@ -509,46 +541,90 @@ static __global__ void BNForwardTraining2D( } // final compute - if(threadIdx.y == 0) { - mean_val[threadIdx.x] = x_sum / inner_size; - variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; - inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); - + if (threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + if (save_mean && save_inv_variance) { - save_mean[i] = mean_val[threadIdx.x]; - save_inv_variance[i] = inv_var_val[threadIdx.x]; + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; } - mean[i] = (1 - exponentialAverageFactor) * 
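[Note] The gridDim.y > 1 path above is the classic two-stage grid reduction: each block writes its partial sums to a global staging buffer, makes the write visible with __threadfence(), bumps a zero-initialized counter with atomicAdd, and only the block that observes old == gridDim.y - 1 folds the partials. A standalone sketch of that pattern for a single summed value (blockDim.x assumed to be 256 and a power of two; names illustrative):

    __global__ void GridSum(const float *in, int n, float *staging /* gridDim.x */,
                            int *flag /* 1 int, zeroed */, float *out) {
      __shared__ float smem[256];                       // assumes blockDim.x == 256
      float partial = 0.f;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += gridDim.x * blockDim.x) {
        partial += in[i];
      }
      smem[threadIdx.x] = partial;                      // intra-block tree reduction
      __syncthreads();
      for (int off = blockDim.x / 2; off > 0; off >>= 1) {
        if (threadIdx.x < off) smem[threadIdx.x] += smem[threadIdx.x + off];
        __syncthreads();
      }
      if (threadIdx.x == 0) staging[blockIdx.x] = smem[0];
      __threadfence();                                  // publish before signaling
      __shared__ bool is_last;
      if (threadIdx.x == 0) {
        int old = atomicAdd(flag, 1);                   // count arrived blocks
        is_last = (old == gridDim.x - 1);               // true in the final arriver
      }
      __syncthreads();
      if (is_last && threadIdx.x == 0) {                // last block folds partials
        float total = 0.f;
        for (int b = 0; b < gridDim.x; ++b) total += staging[b];
        *out = total;
      }
    }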
mean_val[threadIdx.x] + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + - exponentialAverageFactor * variance[i]; + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; } } } else { - if(blockIdx.y == 0 && threadIdx.y == 0) { - mean_val[threadIdx.x] = x_sum / inner_size; - variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; - inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + if (blockIdx.y == 0 && threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); if (save_mean && save_inv_variance) { - save_mean[i] = mean_val[threadIdx.x]; - save_inv_variance[i] = inv_var_val[threadIdx.x]; + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; } - mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + - exponentialAverageFactor * variance[i]; + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; } } + } +} + +template +static __global__ void BNForwardTraining2DUpdateOutput( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + + BatchNormParamType *smem_mean = + reinterpret_cast *>(smem_buf); + BatchNormParamType *smem_inv_var = + reinterpret_cast *>(&smem_buf[blockDim.x]); + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + if (threadIdx.y == 0) { + smem_mean[threadIdx.x] = compute_mean[i]; + smem_inv_var[threadIdx.x] = compute_inv_var[i]; + } __syncthreads(); - for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += blockDim.x) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { const int index = layout == phi::DataLayout::kNCHW ? 
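[Note] The per-channel "final compute" above reduces to the usual moment formulas. With S_1 = sum of x, S_2 = sum of x^2 over n = N*HxW elements, momentum factor f (exponentialAverageFactor), and affine parameters gamma/beta, in LaTeX:

    \mu = S_1 / n, \qquad \sigma^2 = S_2 / n - \mu^2, \qquad
    \hat\sigma^{-1} = 1 / \sqrt{\sigma^2 + \epsilon},
    \text{running\_mean} \leftarrow (1 - f)\,\mu + f\,\text{running\_mean}, \qquad
    \text{running\_var} \leftarrow (1 - f)\,\sigma^2 + f\,\text{running\_var},
    y = \gamma\,(x - \mu)\,\hat\sigma^{-1} + \beta.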
(j / HxW * C + i) * HxW + j % HxW : j * outer_size + i; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val[threadIdx.x]; - y[index] = scale[i] * x_sub_mean * inv_var_val[threadIdx.x] + bias[i]; + static_cast>(x[index]) - smem_mean[threadIdx.x]; + y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; } } } @@ -569,7 +645,8 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( BatchNormParamType *save_mean, BatchNormParamType *save_inv_variance) { extern __shared__ __align__(sizeof(double)) char smem_buf[]; - BatchNormParamType* x_buf = reinterpret_cast*>(smem_buf); + BatchNormParamType *x_buf = + reinterpret_cast *>(smem_buf); int outer_size = C; int inner_size = N * HxW; @@ -646,7 +723,8 @@ inline bool TryDispatchBNForwardTrainingSMem( cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_conf, BNForwardTrainingSMem, - block_size, smem); + block_size, + smem); } if (max_active_blocks_conf <= 0) { return false; @@ -654,46 +732,85 @@ inline bool TryDispatchBNForwardTrainingSMem( const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block_size, 1); const int grid = std::min(C, max_blocks); - BNForwardTrainingSMem<<>>( - x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + BNForwardTrainingSMem<<>>( + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); return true; } template inline void DispatchBNForwardTraining( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { if ((N * HxW) <= 1024) { - // TODO: impl register-cache version + // TODO(yaozihang): impl register-cache version return; } else { bool dispatch_smem_impl_success = false; { - dispatch_smem_impl_success = TryDispatchBNForwardTrainingSMem( - ctx, x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + dispatch_smem_impl_success = + TryDispatchBNForwardTrainingSMem( + ctx, + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); } if (!dispatch_smem_impl_success) { const int block = 512; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); - return BNForwardTraining<<>>( - x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + return BNForwardTraining<<>>( + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); } } } @@ -1016,10 +1133,9 @@ void BatchNormKernel(const Context &ctx, const int max_blocks = std::max(max_threads / block, 1); const int grid = 
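[Note] TryDispatchBNForwardTrainingSMem above only takes the shared-memory path when the occupancy API reports that at least one block can be resident with the requested dynamic shared memory; otherwise DispatchBNForwardTraining falls back to the plain kernel. A reduced sketch of just that gate; the kernel names in the usage comment are placeholders, not Paddle kernels.

    #include <cuda_runtime.h>

    template <typename KernelT>
    static bool FitsOnDevice(KernelT kernel, int block_size, size_t smem_bytes) {
      int max_active_blocks = 0;
      cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_active_blocks, kernel, block_size, smem_bytes);
      return err == cudaSuccess && max_active_blocks > 0;
    }

    // Illustrative usage:
    //   size_t smem = sizeof(float) * inner_size;          // whole row cached
    //   if (FitsOnDevice(MyKernelSMem<float>, 512, smem))
    //     MyKernelSMem<float><<<grid, 512, smem>>>(...);
    //   else
    //     MyKernelPlain<float><<<grid, 512>>>(...);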
std::min(C, max_blocks); if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, - block, - DataLayout::kNCHW><<>>( + BNForwardTraining<<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -1034,10 +1150,9 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTraining< - T, - block, - DataLayout::kNHWC><<>>( + BNForwardTraining<<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -1082,21 +1197,25 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + // const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; - if(use_native_kernel) { + if (use_native_kernel) { dim3 block; dim3 grid; const int block_size = 512; // init block&grid config int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); - int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); if (block_x * block_y != block_size) { - block_x = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); } int grid_x = (C + block_x - 1) / block_x; - int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); block.x = block_x; block.y = block_y; @@ -1106,10 +1225,16 @@ void BatchNormKernel(const Context &ctx, // init intermediate storage DenseTensor block_data_tensor; DenseTensor flag_tensor; - BatchNormParamType* block_data_ptr = nullptr; - int* flag_ptr = nullptr; - if(grid.y > 1) { - block_data_tensor = phi::Empty, Context>(ctx, {2 * C * grid.y}); + DenseTensor compute_mean_tensor = + phi::Empty, Context>(ctx, {C}); + DenseTensor compute_inv_var_tensor = + phi::Empty, Context>(ctx, {C}); + + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + if (grid.y > 1) { + block_data_tensor = + phi::Empty, Context>(ctx, {2 * C * grid.y}); flag_tensor = phi::Empty(ctx, {grid.x}); block_data_ptr = block_data_tensor.data>(); @@ -1118,74 +1243,78 @@ void BatchNormKernel(const Context &ctx, set_zero(ctx, &flag_tensor, static_cast(0)); } - size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; if (compute_format == DataLayout::kNCHW) { - BNForwardTraining2D - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>(), - block_data_ptr, - flag_ptr); - // DispatchBNForwardTraining( - // ctx, - // transformed_x.template data(), - // scale.template data>(), - // bias.template data>(), - // C, - // N, - // H * W * D, - // epsilon, - // this_factor, - // transformed_y.template data(), - // mean_out->template data>(), - // variance_out->template data>(), - // saved_mean->template data>(), - // saved_variance->template data>()); + BNForwardTraining2DComputeStatistic< + T, + block_size, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + 
bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DUpdateOutput< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { - BNForwardTraining2D - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>(), - block_data_ptr, - flag_ptr); - - // DispatchBNForwardTraining( - // ctx, - // transformed_x.template data(), - // scale.template data>(), - // bias.template data>(), - // C, - // N, - // H * W * D, - // epsilon, - // this_factor, - // transformed_y.template data(), - // mean_out->template data>(), - // variance_out->template data>(), - // saved_mean->template data>(), - // saved_variance->template data>()); + BNForwardTraining2DComputeStatistic< + T, + block_size, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DUpdateOutput< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) @@ -1284,7 +1413,7 @@ void BatchNormKernel(const Context &ctx, epsilon, ctx.template Alloc>(saved_mean), ctx.template Alloc>(saved_variance))); -#endif // CUDNN_VERSION_MIN(7, 4, 1) +#endif // CUDNN_VERSION_MIN(7, 4, 1) } #endif } From 74b792b38bf9e6110b619473f49613f958fa0eb1 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 15:55:40 +0800 Subject: [PATCH 10/70] rm wellford --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 292 -------------------- 1 file changed, 292 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 4b263fd983171..5cf81367e4cf7 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,298 +138,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - 
int outer_size = C; - int inner_size = N * HxW; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - constexpr int THREADS_PER_WARP = 32; - constexpr int THREADS_BITS_PER_WARP = 5; - - constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; - const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); - - __shared__ int warp_shared_count[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType local_mean = static_cast>(0); - BatchNormParamType local_var_n = static_cast>(0); - int local_count = 0; - - // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - BatchNormParamType delta = (x_i - local_mean); - local_count++; - local_mean += delta / local_count; - local_var_n += delta * (x_i - local_mean); - } - - // warp sum - for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x % THREADS_PER_WARP == 0) { - warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; - warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; - warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; - } - __syncthreads(); - - // block sum - if (threadIdx.x < WARP_PER_BLOCK) { - local_count = warp_shared_count[threadIdx.x]; - local_mean = warp_shared_count[threadIdx.x]; - local_var_n = warp_shared_count[threadIdx.x]; - } - - for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x == 0) { - mean_val = local_mean; - variance_val = local_var_n / local_count; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -static __global__ -LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - constexpr int PARALLEL_LOADS = 4; - - constexpr int THREADS_PER_WARP = 32; - constexpr int THREADS_BITS_PER_WARP = 5; - - constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; - const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); - - __shared__ int warp_shared_count[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType tmp_local_mean[PARALLEL_LOADS]; - BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; - int tmp_local_count[PARALLEL_LOADS]; - -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - tmp_local_mean[k] = static_cast>(0); - tmp_local_var_n[k] = static_cast>(0); - tmp_local_count[k] = 0; - } - - // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; - j += PARALLEL_LOADS * blockDim.x) { - BatchNormParamType tmp_local_x[PARALLEL_LOADS]; - BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; - BatchNormParamType valid[PARALLEL_LOADS]; - auto offset = j; -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - if (offset < inner_size) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(offset / HxW * C + i) * HxW + offset % HxW - : offset * outer_size + i; - tmp_local_x[k] = static_cast>(x[index]); - tmp_local_count[k]++; - tmp_local_count_inv[k] = - static_cast>(1) / tmp_local_count[k]; - valid[k] = static_cast>(1); - } else { - tmp_local_x[k] = static_cast>(0); - tmp_local_count_inv[k] = static_cast>(0); - valid[k] = static_cast>(0); - } - offset += blockDim.x; - } - -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); - tmp_local_mean[k] += delta * tmp_local_count_inv[k]; - tmp_local_var_n[k] += - delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; - } - } - -#pragma unroll - for (int k = 1; k < PARALLEL_LOADS; k++) { - BatchNormParamType factor = - 1.0 / - static_cast(max(1, tmp_local_count[0] + tmp_local_count[k])); - BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); - tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + - tmp_local_count[k] * tmp_local_mean[k]) * - factor; - tmp_local_var_n[0] += - (tmp_local_var_n[k] + - delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); - tmp_local_count[0] += tmp_local_count[k]; - } - - BatchNormParamType local_mean = tmp_local_mean[0]; - BatchNormParamType local_var_n = tmp_local_var_n[0]; - int local_count = tmp_local_count[0]; - - // warp sum - for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x % THREADS_PER_WARP == 0) { - warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; - warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; - warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; - } - __syncthreads(); - - // block sum - if (threadIdx.x < WARP_PER_BLOCK) { - local_count = warp_shared_count[threadIdx.x]; - local_mean = warp_shared_count[threadIdx.x]; - local_var_n = warp_shared_count[threadIdx.x]; - } - - for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x == 0) { - mean_val = local_mean; - variance_val = local_var_n / local_count; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < 
inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - template static __global__ void BNForwardTraining2DComputeStatistic( const T *x, From a0bd5b67697ea2f0d6981db58bb5f4d3cbca967a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 13:49:46 +0800 Subject: [PATCH 11/70] fix backward --- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 194 +++++++++++------- 1 file changed, 115 insertions(+), 79 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 6de239182c15b..b23b119342d68 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -542,70 +542,60 @@ void BatchNormGradRawKernel(const Context &ctx, // This branch calls CUDNN APIs if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - DenseTensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = - static_cast(ctx.template Alloc(&workspace_tensor)); - - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/ctx.template Alloc(&transformed_d_x), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale.template data>(), - /*bnBiasData=*/nullptr, - /*dBnScaleData=*/ - ctx.template Alloc>(d_scale), - /*dBnBiasData=*/ctx.template Alloc>(d_bias), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/ - const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { #ifdef PADDLE_WITH_HIP + if (compute_format == DataLayout::kNCHW) { + BNBackward + <<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + ctx.template Alloc>(d_scale), + ctx.template Alloc>(d_bias)); + } else { + BNBackward + <<>>( + 
transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + ctx.template Alloc>(d_scale), + ctx.template Alloc>(d_bias)); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); +#else + // CUDNN PER_ACTIVATION mode only support small batch size + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const bool use_native_kernel = + (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward <<>>( @@ -637,22 +627,67 @@ void BatchNormGradRawKernel(const Context &ctx, ctx.template Alloc>(d_scale), ctx.template Alloc>(d_bias)); } + } else { +#if CUDNN_VERSION_MIN(7, 4, 1) + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); + workspace_tensor.Resize({static_cast(workspace_size)}); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/transformed_x.template data(), + /*yDesc=*/nullptr, + /*yData=*/nullptr, + /*dyDesc=*/data_desc_, + /*dyData=*/transformed_d_y.template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/ctx.template Alloc(&transformed_d_x), + 
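[Note] On the restructured backward branch above: cuDNN's PER_ACTIVATION batch-norm backward is bounded in the batch dimension (the 131070 threshold), so for rank-2 inputs with a very large N the native BNBackward kernel is preferred, while the workspace-based cudnnBatchNormalizationBackwardEx path (when CUDNN >= 7.4.1) is kept otherwise. A condensed sketch of the selection logic only; the two Launch* functions are placeholders for the launches shown in the diff, not real APIs.

    static void LaunchNativeBNBackward()  { /* BNBackward<T, block, layout><<<...>>> */ }
    static void LaunchCudnnBNBackwardEx() { /* cudnnBatchNormalizationBackwardEx path */ }

    static void DispatchBNBackward(int x_rank, long long N, bool with_cudnn) {
      // cuDNN PER_ACTIVATION mode only supports a bounded batch size.
      const long long kCudnnPerActivationThreshold = 131070;
      const bool use_native_kernel =
          (x_rank == 2 && N >= kCudnnPerActivationThreshold);
      if (use_native_kernel || !with_cudnn) {
        LaunchNativeBNBackward();
      } else {
        LaunchCudnnBNBackwardEx();
      }
    }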
/*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale.template data>(), + /*bnBiasData=*/nullptr, + /*dBnScaleData=*/ + ctx.template Alloc>(d_scale), + /*dBnBiasData=*/ + ctx.template Alloc>(d_bias), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/nullptr, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/ + const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnBatchNormalizationBackward( @@ -675,8 +710,9 @@ void BatchNormGradRawKernel(const Context &ctx, epsilon, saved_mean_data, saved_var_data)); -#endif +#endif // CUDNN_VERSION_MIN(7, 4, 1) } +#endif if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW) { From 2433ebf80f0e9c019b2d1ce587d6b965476b0638 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 14:39:53 +0800 Subject: [PATCH 12/70] add unit test for batchnorm1d --- .../tests/unittests/test_batch_norm_op_v2.py | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 9db95f094a7e3..cfd5d5f7c9bd0 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -110,11 +110,43 @@ def compute_v2(x): y.backward() return y.numpy(), x1.gradient() - x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) - self.assertTrue(np.allclose(g1, g2)) - self.assertTrue(np.allclose(y1, y2)) + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_v1(x) + y2, g2 = compute_v2(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + + def test_eager_api_1d(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [200000, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_v2(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_v1(x) + y2, g2 = compute_v2(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) def test_dygraph(self): places = [fluid.CPUPlace()] From 90c27a653c5a3a9d0295eed48446d2a58bb667f8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 21:40:24 +0800 Subject: [PATCH 13/70] fix bug --- paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 5 +++-- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b23b119342d68..1e93803866e3f 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -593,8 +593,9 @@ void BatchNormGradRawKernel(const Context &ctx, #else // CUDNN PER_ACTIVATION mode only support small batch size const size_t 
CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + // const bool use_native_kernel = + // (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + const bool use_native_kernel = true; if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 5cf81367e4cf7..90e9b45d3ab4e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -304,12 +304,13 @@ static __global__ void BNForwardTraining2DUpdateOutput( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(double)) char smem_buf[]; + extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; BatchNormParamType *smem_mean = reinterpret_cast *>(smem_buf); BatchNormParamType *smem_inv_var = - reinterpret_cast *>(&smem_buf[blockDim.x]); + reinterpret_cast *>( + smem_buf + blockDim.x * sizeof(BatchNormParamType)); int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; From 91d83e559cf7fc7c0b8607b2d3a8a7b628ca1c89 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 12 Jun 2022 00:12:56 +0800 Subject: [PATCH 14/70] impl channel last 2d --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 296 +++++++++++++++++--- 1 file changed, 260 insertions(+), 36 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 90e9b45d3ab4e..07539c221d06d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -139,7 +139,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } template -static __global__ void BNForwardTraining2DComputeStatistic( +static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -291,7 +291,7 @@ static __global__ void BNForwardTraining2DComputeStatistic( } template -static __global__ void BNForwardTraining2DUpdateOutput( +static __global__ void BNForwardTraining2DChannelLastWriteRes( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -335,6 +335,203 @@ static __global__ void BNForwardTraining2DUpdateOutput( } } +template +static __global__ void BNForwardTraining2DCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = layout == 
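[Note] On the "fix bug" change to the write-result kernel above: smem_buf is declared as char[], so the old expression &smem_buf[blockDim.x] advanced by blockDim.x bytes rather than blockDim.x parameter-type elements, making the two shared arrays overlap; the fix offsets by blockDim.x * sizeof(BatchNormParamType) and raises the alignment to that type. A minimal sketch of carving one dynamic shared buffer into two typed arrays (kernel and contents are illustrative):

    // Launch with 2 * blockDim.x * sizeof(ParamT) bytes of dynamic shared memory.
    template <typename ParamT>
    __global__ void TwoSmemArraysKernel(const ParamT *in, ParamT *out, int n) {
      extern __shared__ __align__(sizeof(ParamT)) char smem_buf[];
      ParamT *smem_a = reinterpret_cast<ParamT *>(smem_buf);
      // Offset in *bytes*: blockDim.x elements of ParamT, not blockDim.x chars.
      ParamT *smem_b =
          reinterpret_cast<ParamT *>(smem_buf + blockDim.x * sizeof(ParamT));
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        smem_a[threadIdx.x] = in[i];           // first array: the value itself
        smem_b[threadIdx.x] = in[i] * in[i];   // second array: its square
      }
      __syncthreads();
      if (i < n) out[i] = smem_a[threadIdx.x] + smem_b[threadIdx.x];
    }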
phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // horizonal block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + if (gridDim.x > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.x]; + // write block data to global memory + if (threadIdx.x == 0) { + staging_sum[i + blockIdx.x * C] = x_sum; + staging_square_sum[i + blockIdx.x * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.y], 1); + is_last_block_done = (old == (gridDim.x - 1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) { + x_sum += staging_sum[i + x * C]; + x_square_sum += staging_square_sum[i + x * C]; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.y) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + // final compute + if (threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.x == 0 && threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static 
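[Note] In the NCHW variant introduced above the block axes are swapped relative to the channel-last kernel: each row of the block (one threadIdx.y value) owns one channel and reduces along threadIdx.x, so the intra-block reduction runs horizontally over blockDim.x. A trimmed device-side sketch of that horizontal step for a plain sum, assuming blockDim.x is a power of two and smem has blockDim.x * blockDim.y floats (names illustrative):

    __device__ inline float RowReduceSum(float v, float *smem) {
      const int tid = threadIdx.x + threadIdx.y * blockDim.x;  // row-major slot
      smem[tid] = v;
      __syncthreads();
      for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
          smem[tid] += smem[tid + offset];     // fold the right half of the row
        }
        __syncthreads();
      }
      return smem[threadIdx.y * blockDim.x];   // row total, same for all lanes
    }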
__global__ void BNForwardTraining2DWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; + + BatchNormParamType *smem_mean = + reinterpret_cast *>(smem_buf); + BatchNormParamType *smem_inv_var = + reinterpret_cast *>( + smem_buf + blockDim.y * sizeof(BatchNormParamType)); + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + if (threadIdx.x == 0) { + smem_mean[threadIdx.y] = compute_mean[i]; + smem_inv_var[threadIdx.y] = compute_inv_var[i]; + } + __syncthreads(); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - smem_mean[threadIdx.y]; + y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; + } + } +} + template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( const T *x, @@ -900,25 +1097,7 @@ void BatchNormKernel(const Context &ctx, if (use_native_kernel) { dim3 block; dim3 grid; - const int block_size = 512; - // init block&grid config - int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); - int block_y = - std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), - block_size / block_x); - if (block_x * block_y != block_size) { - block_x = std::min(phi::funcs::details::GetLastPow2(C), - block_size / block_y); - } - int grid_x = (C + block_x - 1) / block_x; - int grid_y = - std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); - - block.x = block_x; - block.y = block_y; - grid.x = grid_x; - grid.y = grid_y; // init intermediate storage DenseTensor block_data_tensor; @@ -930,19 +1109,34 @@ void BatchNormKernel(const Context &ctx, BatchNormParamType *block_data_ptr = nullptr; int *flag_ptr = nullptr; - if (grid.y > 1) { - block_data_tensor = - phi::Empty, Context>(ctx, {2 * C * grid.y}); - flag_tensor = phi::Empty(ctx, {grid.x}); - - block_data_ptr = block_data_tensor.data>(); - flag_ptr = flag_tensor.data(); - funcs::SetConstant set_zero; - set_zero(ctx, &flag_tensor, static_cast(0)); - } - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining2DComputeStatistic + if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { + // init block&grid config + int block_x = std::min( + phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size); + int block_y = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_x); + + int grid_x = std::min( + (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); + int grid_y = (C + block_y - 1) / block_y; + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.x > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.x}); + flag_tensor = phi::Empty(ctx, {grid.y}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -962,8 +1156,8 @@ void 
BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DUpdateOutput + size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DWriteRes <<>>( transformed_x.template data(), scale.template data>(), @@ -975,7 +1169,37 @@ void BatchNormKernel(const Context &ctx, compute_mean_tensor.data>(), compute_inv_var_tensor.data>()); } else { - BNForwardTraining2DComputeStatistic + // init block&grid config + int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = std::min( + (N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.y > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -996,7 +1220,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DUpdateOutput + BNForwardTraining2DChannelLastWriteRes <<>>( transformed_x.template data(), scale.template data>(), From 6871dbf6a0f606a798d22a62df075111667fe023 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Wed, 15 Jun 2022 11:11:22 +0800 Subject: [PATCH 15/70] refine --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 226 ++------------------ 1 file changed, 20 insertions(+), 206 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 07539c221d06d..abaca230d0ec1 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,7 +138,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } -template +template static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, const BatchNormParamType *scale, @@ -173,9 +173,7 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = j * outer_size + i; BatchNormParamType x_i = static_cast>(x[index]); x_sum += x_i; x_square_sum += x_i * x_i; @@ -290,7 +288,7 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } } -template +template static __global__ void BNForwardTraining2DChannelLastWriteRes( const T *x, const BatchNormParamType *scale, @@ -325,9 +323,7 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = j * outer_size + i; BatchNormParamType x_sub_mean = static_cast>(x[index]) - smem_mean[threadIdx.x]; y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; @@ -335,7 +331,7 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( } } -template +template static __global__ void BNForwardTraining2DCompStat( const T *x, const BatchNormParamType *scale, @@ -370,9 +366,7 @@ static __global__ void BNForwardTraining2DCompStat( for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_i = static_cast>(x[index]); x_sum += x_i; x_square_sum += x_i * x_i; @@ -487,7 +481,7 @@ static __global__ void BNForwardTraining2DCompStat( } } -template +template static __global__ void BNForwardTraining2DWriteRes( const T *x, const BatchNormParamType *scale, @@ -522,9 +516,7 @@ static __global__ void BNForwardTraining2DWriteRes( for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_sub_mean = static_cast>(x[index]) - smem_mean[threadIdx.y]; y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; @@ -532,188 +524,6 @@ static __global__ void BNForwardTraining2DWriteRes( } } -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - extern __shared__ __align__(sizeof(double)) char smem_buf[]; - BatchNormParamType *x_buf = - reinterpret_cast *>(smem_buf); - - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_buf[j] = x_i; - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x_buf[j]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -inline bool TryDispatchBNForwardTrainingSMem( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - constexpr int block_size = 512; - const size_t smem = N * HxW * sizeof(BatchNormParamType); - int max_active_blocks_conf; - { - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf, - BNForwardTrainingSMem, - block_size, - smem); - } - if (max_active_blocks_conf <= 0) { - return false; - } - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block_size, 1); - const int grid = std::min(C, max_blocks); - BNForwardTrainingSMem - <<>>(x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - return true; -} - -template -inline void DispatchBNForwardTraining( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - if ((N * HxW) <= 1024) { - // TODO(yaozihang): impl register-cache version - return; - } else { - bool dispatch_smem_impl_success = false; - { - dispatch_smem_impl_success = - TryDispatchBNForwardTrainingSMem( - ctx, - x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - } - if (!dispatch_smem_impl_success) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - return BNForwardTraining - <<>>(x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - } - } -} - template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -1112,11 +922,17 @@ void BatchNormKernel(const Context &ctx, if 
(x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { // init block&grid config - int block_x = std::min( - phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size); + int block_x = + std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size); int block_y = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_x); + if (block_x * block_y != block_size) { + block_x = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_y); + } + int grid_x = std::min( (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); int grid_y = (C + block_y - 1) / block_y; @@ -1136,7 +952,7 @@ void BatchNormKernel(const Context &ctx, funcs::SetConstant set_zero; set_zero(ctx, &flag_tensor, static_cast(0)); } - BNForwardTraining2DCompStat + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -1157,7 +973,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DWriteRes + BNForwardTraining2DWriteRes <<>>( transformed_x.template data(), scale.template data>(), @@ -1197,9 +1013,7 @@ void BatchNormKernel(const Context &ctx, funcs::SetConstant set_zero; set_zero(ctx, &flag_tensor, static_cast(0)); } - BNForwardTraining2DChannelLastCompStat + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -1220,7 +1034,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DChannelLastWriteRes + BNForwardTraining2DChannelLastWriteRes <<>>( transformed_x.template data(), scale.template data>(), From 0571ecc7ef662a4b718cd9dfe1cd0b0a4cf0b43a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Wed, 15 Jun 2022 16:01:14 +0800 Subject: [PATCH 16/70] fix memory thpt --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 82 +++++++++------------ 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index abaca230d0ec1..6d54e4193b007 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -302,31 +302,22 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; - - BatchNormParamType *smem_mean = - reinterpret_cast *>(smem_buf); - BatchNormParamType *smem_inv_var = - reinterpret_cast *>( - smem_buf + blockDim.x * sizeof(BatchNormParamType)); - int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { - if (threadIdx.y == 0) { - smem_mean[threadIdx.x] = compute_mean[i]; - smem_inv_var[threadIdx.x] = compute_inv_var[i]; - } - __syncthreads(); + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { const int index = j * outer_size + i; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - smem_mean[threadIdx.x]; - y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; 
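For reference: after this change the write-back kernels read the per-channel mean/inv_var (and scale/bias) into registers once per channel instead of staging them in shared memory, then apply y = scale * (x - mean) * inv_var + bias element-wise. Below is a minimal CPU sketch of that step, assuming the statistics are already computed; the function and variable names are illustrative and not part of the patch.

#include <vector>

// Host-side reference of the write-back step. The per-channel values are
// read once and reused, matching the register-cached values in the kernels
// above. `layout_nchw` selects between the two index forms used for NCHW
// and channel-last data.
void BNWriteBackReference(const std::vector<float>& x,
                          const std::vector<float>& scale,
                          const std::vector<float>& bias,
                          const std::vector<float>& compute_mean,
                          const std::vector<float>& compute_inv_var,
                          int C, int N, int HxW, bool layout_nchw,
                          std::vector<float>* y) {
  const int inner_size = N * HxW;
  for (int c = 0; c < C; ++c) {
    const float mean_val = compute_mean[c];
    const float inv_var_val = compute_inv_var[c];
    const float scale_val = scale[c];
    const float bias_val = bias[c];
    for (int j = 0; j < inner_size; ++j) {
      // NCHW: (n * C + c) * HxW + s, with n = j / HxW and s = j % HxW.
      // NHWC: j * C + c.
      const int index = layout_nchw ? (j / HxW * C + c) * HxW + j % HxW
                                    : j * C + c;
      (*y)[index] = scale_val * (x[index] - mean_val) * inv_var_val + bias_val;
    }
  }
}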
} } } @@ -495,31 +486,22 @@ static __global__ void BNForwardTraining2DWriteRes( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; - - BatchNormParamType *smem_mean = - reinterpret_cast *>(smem_buf); - BatchNormParamType *smem_inv_var = - reinterpret_cast *>( - smem_buf + blockDim.y * sizeof(BatchNormParamType)); - int outer_loop_stride = gridDim.y * blockDim.y; int inner_loop_stride = gridDim.x * blockDim.x; for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; i += outer_loop_stride) { - if (threadIdx.x == 0) { - smem_mean[threadIdx.y] = compute_mean[i]; - smem_inv_var[threadIdx.y] = compute_inv_var[i]; - } - __syncthreads(); + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - smem_mean[threadIdx.y]; - y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; } } } @@ -908,6 +890,8 @@ void BatchNormKernel(const Context &ctx, dim3 block; dim3 grid; const int block_size = 512; + const int MAX_GRID_SIZE = 128; + const int WARP_SIZE = 32; // init intermediate storage DenseTensor block_data_tensor; @@ -933,8 +917,9 @@ void BatchNormKernel(const Context &ctx, block_size / block_y); } - int grid_x = std::min( - (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); + int grid_x = + std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16), + MAX_GRID_SIZE); int grid_y = (C + block_y - 1) / block_y; block.x = block_x; @@ -972,21 +957,20 @@ void BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DWriteRes - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - transformed_y.template data(), - compute_mean_tensor.data>(), - compute_inv_var_tensor.data>()); + BNForwardTraining2DWriteRes<<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { // init block&grid config - int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_x = + std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); @@ -995,8 +979,9 @@ void BatchNormKernel(const Context &ctx, block_size / block_y); } int grid_x = (C + block_x - 1) / block_x; - int grid_y = std::min( - (N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), + MAX_GRID_SIZE); block.x = block_x; block.y = block_y; @@ -1033,9 +1018,8 @@ void BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); BNForwardTraining2DChannelLastWriteRes - <<>>( + <<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), From 3fc54ada9a14d577234023064f772c82dcc01c76 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 16 Jun 2022 05:47:24 +0000 
Subject: [PATCH 17/70] opt gather --- .../phi/kernels/sparse/gpu/convolution.cu.h | 19 +++++++++++++------ .../sparse/gpu/convolution_grad_kernel.cu | 8 ++++---- .../kernels/sparse/gpu/convolution_kernel.cu | 4 ++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..74d3108806a2a 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { namespace sparse { @@ -46,18 +47,24 @@ using Dims4D = phi::funcs::sparse::Dims4D; * index_size: the size of indices * slice_size: slice size corresponding to each index, here is the channel size **/ -template +template __global__ void GatherKernel(const T* params, const IndexT* indices, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size*slice_size/VecSize, int64_t) { + const int vec_slice_size = slice_size / VecSize; + int indices_i = i / vec_slice_size; + int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i; - *(output + i) = *(params + params_i); + int64_t params_i = gather_i * slice_size + slice_i*VecSize; + //*(output + i) = *(params + params_i); + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + LoadT params_vec; + phi::Load(params + params_i, ¶ms_vec); + phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d83d064418eec..0b89d4a2dc269 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -138,8 +138,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<<<>>(x.non_zero_elements().data(), @@ -149,8 +149,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, in_channels); config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel + dev_ctx, rulebook_len * out_channels/sizeof(T), 1); + GatherKernel <<(0.0f)); auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<<<>>(x.non_zero_elements().data(), From 804ba033849e230dbce5ee827a68e4e578c46485 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 15:24:16 +0800 Subject: [PATCH 18/70] fix threshold --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6d54e4193b007..9c21b0e716165 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -884,8 +884,8 @@ void BatchNormKernel(const Context &ctx, // 
static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - // const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); - const bool use_native_kernel = true; + const bool use_native_kernel = ((x_dims.size() == 2 && N >= 131070) || + (x_dims.size() == 3 && N >= 880801)); if (use_native_kernel) { dim3 block; dim3 grid; From 48c634469b8194117a5ed388cdf45c17297e1aa7 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 22:56:40 +0800 Subject: [PATCH 19/70] fix backward threshold --- paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 9 +++++---- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 1e93803866e3f..0f028f42a956c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -591,11 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx, // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - // CUDNN PER_ACTIVATION mode only support small batch size + // CUDNN only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - // const bool use_native_kernel = - // (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); - const bool use_native_kernel = true; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 9c21b0e716165..c70bfc3d2c27a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -884,8 +884,11 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - const bool use_native_kernel = ((x_dims.size() == 2 && N >= 131070) || - (x_dims.size() == 3 && N >= 880801)); + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { dim3 block; dim3 grid; From 6785f6f20076d9bf1b48768fec5efaf5f960658a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 23:39:10 +0800 Subject: [PATCH 20/70] refine unit test --- .../tests/unittests/test_batch_norm_op_v2.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index cfd5d5f7c9bd0..193f43c7eb9f0 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -82,50 +82,50 @@ def error3d(): self.assertRaises(ValueError, error2d_dataformat) self.assertRaises(ValueError, error3d_dataformat) - def test_eager_api(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - shape = [4, 10, 4, 4] + def test_large_batch(self): - def compute_v1(x): - with 
fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) - #bn = paddle.nn.BatchNorm2D(shape[1]) + def compute_baseline(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_1d(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) y.backward() return y.numpy(), x1.gradient() - def compute_v2(x): - with fluid.dygraph.guard(p): - with _test_eager_guard(): - print("v2") - bn = paddle.nn.BatchNorm2D(shape[1]) - x1 = paddle.to_tensor(x) - x1.stop_gradient = False - y = bn(x1) - y.backward() - return y.numpy(), x1.gradient() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [200000, 4] x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) self.assertTrue(np.allclose(g1, g2)) self.assertTrue(np.allclose(y1, y2)) - def test_eager_api_1d(self): + def test_eager_api(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: - shape = [200000, 4] + shape = [4, 10, 4, 4] def compute_v1(x): with fluid.dygraph.guard(p): bn = fluid.dygraph.BatchNorm(shape[1]) + #bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -135,7 +135,8 @@ def compute_v1(x): def compute_v2(x): with fluid.dygraph.guard(p): with _test_eager_guard(): - bn = paddle.nn.BatchNorm1D(shape[1]) + print("v2") + bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) From e46ef54a4bd6a9e53660e165950415a22ecc8bb8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 23:42:53 +0800 Subject: [PATCH 21/70] refine test --- .../paddle/fluid/tests/unittests/test_batch_norm_op_v2.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 193f43c7eb9f0..7aa3b8cddf80c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -107,8 +107,16 @@ def compute_1d(x): if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: + # [N, C] shape = [200000, 4] + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + # [N, C, L] + shape = [1000000, 4, 4] x = np.random.randn(*shape).astype("float32") y1, g1 = compute_baseline(x) y2, g2 = compute_1d(x) From 938cde3131225bbdab9a8eb28d6cd83331c58830 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Fri, 17 Jun 2022 14:32:07 +0800 Subject: [PATCH 22/70] delete pragma unroll --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index c70bfc3d2c27a..a26bba041912b 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -229,7 +229,6 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( // vertical block sum int tid = 
threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; @@ -365,7 +364,6 @@ static __global__ void BNForwardTraining2DCompStat( // horizonal block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (threadIdx.x < offset * 2) { smem_sum[tid] = x_sum; @@ -413,7 +411,6 @@ static __global__ void BNForwardTraining2DCompStat( // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (threadIdx.x < offset * 2) { smem_sum[tid] = x_sum; From a24f2aa8d0681e8a940aa686bb5f80f70fb71671 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 20 Jun 2022 09:20:32 +0000 Subject: [PATCH 23/70] opt gather and scatter --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 31 +++++--- .../kernels/sparse/gpu/coalesced_kernel.cu | 36 ++++++--- .../phi/kernels/sparse/gpu/convolution.cu.h | 1 - .../sparse/gpu/convolution_grad_kernel.cu | 73 +++++++++++------ .../kernels/sparse/gpu/convolution_kernel.cu | 78 +++++++++++++------ 5 files changed, 150 insertions(+), 69 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index b9568f1df716d..48b12f8a1b6de 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +#define VecBytes 16 namespace phi { namespace funcs { @@ -28,33 +31,37 @@ namespace sparse { * channels: the output channel size * out: the outputs **/ -template +template __global__ void ScatterKernel(const T* input, const int* unique_value, const int* out_index, const int non_zero_num, const int rulebook_len, const int channels, - T* out, - const bool subm = false) { + T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } + StoreT sums={static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; + LoadT vec_in; + phi::Load(input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for(int k = 0; k < VecSize; k++){ + sums[k] += vec_in[k]; + } } - out[indices_i * channels + channels_i] = sum; + phi::Store(sums, out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 7d9e566916add..44ecb4365a9c1 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -132,16 +132,32 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } // 5. scatter the values - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + const int VecSize = VecBytes / sizeof(T); + if(stride % VecSize == 0){ + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + }else{ + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } // 6. convert index to coordinate Dim const_dims; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 74d3108806a2a..7f0d4814b75d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -59,7 +59,6 @@ __global__ void GatherKernel(const T* params, int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; int64_t params_i = gather_i * slice_size + slice_i*VecSize; - //*(output + i) = *(params + params_i); using LoadT = phi::AlignedVector; using StoreT = phi::AlignedVector; LoadT params_vec; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 0b89d4a2dc269..ebf2bf6cae896 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -23,6 +23,7 @@ limitations under the License. 
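For reference: the vectorized gather moves VecSize = VecBytes / sizeof(T) contiguous elements of a channel slice per thread (16 bytes, i.e. 4 floats) with one aligned load/store, and the call sites fall back to VecSize = 1 when the channel count is not divisible by VecSize. Below is a minimal host-side sketch of the same index arithmetic, assuming slice_size % VecSize == 0; the helper name is illustrative and not part of the patch.

#include <cstddef>
#include <vector>

// Host-side reference of the vectorized gather: output[i] takes the row
// params[indices[i]], copied VecSize elements at a time. On the GPU each
// inner copy is a single aligned vector load/store via phi::AlignedVector.
template <typename T, typename IndexT, int VecSize>
void GatherRowsReference(const std::vector<T>& params,        // [num_rows, slice_size]
                         const std::vector<IndexT>& indices,  // [index_size]
                         std::vector<T>* output,              // [index_size, slice_size]
                         std::size_t slice_size) {
  const std::size_t vec_slice_size = slice_size / VecSize;
  for (std::size_t i = 0; i < indices.size() * vec_slice_size; ++i) {
    const std::size_t indices_i = i / vec_slice_size;
    const std::size_t slice_i = i - indices_i * vec_slice_size;
    const std::size_t params_i =
        static_cast<std::size_t>(indices[indices_i]) * slice_size +
        slice_i * VecSize;
    for (int k = 0; k < VecSize; ++k) {
      (*output)[i * VecSize + k] = params[params_i + k];
    }
  }
}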
*/ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -137,28 +138,56 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels/sizeof(T), 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if(in_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels/sizeof(T), 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if(out_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -203,7 +232,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); phi::funcs::ScatterCUDAKernel<< set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels/sizeof(T), 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if(in_channels % VecSize == 0){ + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels / VecSize, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + }else{ + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -155,7 +169,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 4. 
scatter if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); phi::funcs::ScatterCUDAKernel <<nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + if(out_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 64be38b56a5c931cfeff710aa69bc6336472fe46 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 20 Jun 2022 10:56:50 +0000 Subject: [PATCH 24/70] opt conv --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 34 +++++--- .../kernels/sparse/gpu/coalesced_kernel.cu | 39 ++++++--- .../phi/kernels/sparse/gpu/convolution.cu.h | 18 +++-- .../sparse/gpu/convolution_grad_kernel.cu | 75 ++++++++++++----- .../kernels/sparse/gpu/convolution_kernel.cu | 80 +++++++++++++------ 5 files changed, 172 insertions(+), 74 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index b9568f1df716d..cd89c916db577 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +#define VecBytes 16 namespace phi { namespace funcs { @@ -28,33 +31,40 @@ namespace sparse { * channels: the output channel size * out: the outputs **/ -template +template __global__ void ScatterKernel(const T* input, const int* unique_value, const int* out_index, const int non_zero_num, const int rulebook_len, const int channels, - T* out, - const bool subm = false) { + T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } + StoreT sums = {static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } } - out[indices_i * channels + channels_i] = sum; + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 7d9e566916add..60d90a18d4633 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -132,16 +132,35 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } // 5. scatter the values - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } // 6. convert index to coordinate Dim const_dims; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..a08c7931bb4f4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -24,6 +24,7 @@ limitations under the License. 
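For reference: the scatter now accumulates VecSize channels at a time per thread, and the `subm` in-place accumulation branch was removed from this kernel (the submanifold path zero-initializes the output and scatters separately). Below is a scalar host-side sketch of the reduction it performs, with illustrative names only.

#include <vector>

// Host-side reference of the scatter-accumulate step: for each output row r,
// the rows input[out_index[j]] with j in [unique_value[r], end) are summed
// channel-wise, where end is unique_value[r + 1] (or rulebook_len for the
// last row).
template <typename T>
void ScatterSumReference(const std::vector<T>& input,          // [rulebook_len, channels]
                         const std::vector<int>& unique_value, // [non_zero_num]
                         const std::vector<int>& out_index,    // [rulebook_len]
                         int non_zero_num, int rulebook_len, int channels,
                         std::vector<T>* out) {                // [non_zero_num, channels]
  for (int r = 0; r < non_zero_num; ++r) {
    const int start = unique_value[r];
    const int end =
        (r == non_zero_num - 1) ? rulebook_len : unique_value[r + 1];
    for (int c = 0; c < channels; ++c) {
      T sum = static_cast<T>(0);
      for (int j = start; j < end; ++j) {
        sum += input[out_index[j] * channels + c];
      }
      (*out)[r * channels + c] = sum;
    }
  }
}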
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" @@ -46,18 +47,23 @@ using Dims4D = phi::funcs::sparse::Dims4D; * index_size: the size of indices * slice_size: slice size corresponding to each index, here is the channel size **/ -template +template __global__ void GatherKernel(const T* params, const IndexT* indices, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size / VecSize, int64_t) { + const int vec_slice_size = slice_size / VecSize; + int indices_i = i / vec_slice_size; + int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i; - *(output + i) = *(params + params_i); + int64_t params_i = gather_i * slice_size + slice_i * VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + LoadT params_vec; + phi::Load(params + params_i, ¶ms_vec); + phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d83d064418eec..d91c93fde66fb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -137,28 +138,58 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -203,7 +234,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); phi::funcs::ScatterCUDAKernel<< set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, n * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } else { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -155,7 +171,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 4. 
scatter if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); phi::funcs::ScatterCUDAKernel <<nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 26ca7ab0c54dd03cbc1e85c095a6839d23cfff54 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 06:58:05 +0000 Subject: [PATCH 25/70] fix batch csr --- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 2 +- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 2 +- .../tests/unittests/test_sparse_utils_op.py | 78 ++++++++----------- 3 files changed, 36 insertions(+), 46 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 1cd3086d5f74c..8bf0104ef0baf 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -198,7 +198,7 @@ void SparseCooToCsrCPUKernel(const CPUContext& dev_ctx, const auto& coo_values = x.non_zero_elements(); const IntT* batchs_ptr = coo_indices.data(); const IntT* coo_rows_data = - batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num; + x_dims.size() == 2 ? batchs_ptr : batchs_ptr + non_zero_num; const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1ed4ebd23db87..bcc979dcbe51d 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -370,7 +370,7 @@ void SparseCooToCsrGPUKernel(const GPUContext& dev_ctx, const auto& coo_values = x.non_zero_elements(); const IntT* batchs_ptr = coo_indices.data(); const IntT* coo_rows_data = - batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num; + x_dims.size() == 2 ? 
batchs_ptr : batchs_ptr + non_zero_num; const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 6cc1d9cf96cae..a12425b69299e 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -318,50 +318,40 @@ def test_sparse_coo_tensor_sorted(self): def test_batch_csr(self): with _test_eager_guard(): - shape = [3, 3, 3] - - def verify(x, crows, cols, values): - x = paddle.to_tensor(x) - csr = x.to_sparse_csr() - assert np.allclose(crows, csr.crows().numpy()) - assert np.allclose(cols, csr.cols().numpy()) - assert np.allclose(values, csr.values().numpy()) - - dense = csr.to_dense() - assert np.allclose(x.numpy(), dense.numpy()) - - x = [ - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - ] - crows = [[0, 1, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) - - x = [ - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - ] - crows = [[0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2, 3]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) - - x = [ - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - ] - crows = [[0, 1, 2, 3, 0, 1, 2, 3, 0, 0, 0, 0]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) + + def verify(dense_x): + sparse_x = dense_x.to_sparse_csr() + out = sparse_x.to_dense() + assert np.allclose(out.numpy(), dense_x.numpy()) + + shape = np.random.randint(low=1, high=10, size=3) + shape = list(shape) + dense_x = paddle.randn(shape) + dense_x = paddle.nn.functional.dropout(dense_x, p=0.5) + verify(dense_x) + + #test batchs=1 + shape[0] = 1 + dense_x = paddle.randn(shape) + dense_x = paddle.nn.functional.dropout(dense_x, p=0.5) + verify(dense_x) + + shape = np.random.randint(low=2, high=10, size=3) + shape = list(shape) + dense_x = paddle.randn(shape) + #set the 0th batch to zero + dense_x[0] = 0 + verify(dense_x) + + dense_x = paddle.randn(shape) + #set the 1th batch to zero + dense_x[1] = 0 + verify(dense_x) + + dense_x = paddle.randn(shape) + #set the 2th batch to zero + dense_x[2] = 0 + verify(dense_x) class TestCooError(unittest.TestCase): From 11011c0b63288e6c8168f997bfae1caa9f2d3ef1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 07:00:11 +0000 Subject: [PATCH 26/70] remove the unused file --- .../unittests/test_sparse_middle_extractor.py | 324 ------------------ .../tests/unittests/test_sparse_mnist.py | 126 ------- 2 files changed, 450 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py deleted file mode 100644 index ae52b4a413336..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py +++ /dev/null @@ -1,324 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.sparse as sparse -from 
paddle.fluid.framework import _test_eager_guard -import time -import numpy as np -import torch -import spconv.pytorch as spconv -import inspect - -class MiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='MiddleExtractor'): - super(MiddleExtractor, self).__init__() - self.name = name - if not use_norm: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - else: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - def forward(self, x): - return self.middle_conv(x) - - -def get_pos_to_kw_map(func): - pos_to_kw = {} - fsig = inspect.signature(func) - pos = 0 - for name, info in fsig.parameters.items(): - if info.kind is info.POSITIONAL_OR_KEYWORD: - pos_to_kw[pos] = name - pos += 1 - return pos_to_kw - -def change_default_args(**kwargs): - def layer_wrapper(layer_class): - class DefaultArgLayer(layer_class): - def __init__(self, *args, **kw): - pos_to_kw = get_pos_to_kw_map(layer_class.__init__) - kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} - for key, val in kwargs.items(): - if key not in kw and kw_to_pos[key] > len(args): - kw[key] = val - super().__init__(*args, **kw) - - return DefaultArgLayer - - return layer_wrapper - -class Empty(torch.nn.Module): - def __init__(self, *args, **kwargs): - super(Empty, self).__init__() - - def forward(self, *args, **kwargs): - if len(args) == 1: - return args[0] - elif len(args) == 0: - return None - return args - -class SpconvMiddleExtractor(torch.nn.Module): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SpconvMiddleExtractor'): - super(SpconvMiddleExtractor, self).__init__() - if use_norm: - BatchNorm1d = change_default_args( - eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) - Linear = change_default_args(bias=False)(nn.Linear) - else: - BatchNorm1d = Empty - Linear = change_default_args(bias=True)(nn.Linear) - - middle_layers = [] - - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d1: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - middle_layers.append( - 
spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - - if use_norm: - #middle_layers.append( - # BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - - # assert len(num_filters_down2) > 0 - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d2: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - if use_norm: - #middle_layers.append( - #BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) - middle_layers.append(spconv.ToDense()) - self.middle_conv = spconv.SparseSequential(*middle_layers) - - def forward(self, x): - out = self.middle_conv(x) - return out - -class SparseMiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SparseMiddleExtractor'): - super(SparseMiddleExtractor, self).__init__() - self.name = name - - middle_layers = [] - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d1: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d2: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - self.middle_conv = nn.Sequential(*middle_layers) - - def forward(self, x): - sparse_out = self.middle_conv(x) - #return sparse_out - return sparse_out.to_dense() - - -def test(): - paddle.seed(0) - with _test_eager_guard(): - in_channels = 128 - # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. 
- shape = [20, 40, 100] - batch_size = 1 - sparsity = 0.95 - - full_shape = [batch_size] + shape + [in_channels] - print(full_shape) - - total_elements = np.prod(shape) - nnz = int(total_elements * (1-sparsity)) - print("nnz=", nnz) - - #product indices - indices = [] - for i in range(4): - indices.append(paddle.randint(0, full_shape[i], [1, nnz])) - - indices = paddle.concat(indices) - #product values - values = paddle.randn((nnz, in_channels)) - - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) - - dense_x = sparse_x.to_dense() - - #spconv - device = torch.device("cuda") - torch_x = torch.tensor(dense_x.numpy(), device=device) - - spconv_x = spconv.SparseConvTensor.from_dense(torch_x) - - #whether to use batch_norm - use_norm = True - - dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) - sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - layer_nums = len(sparse_model.middle_conv) - block_size = 3 if use_norm else 2 - layer_nums = int(layer_nums / block_size) - - for i in range(0, layer_nums): - weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) - if use_norm: - bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) - - print(dense_model) - print(sparse_model) - print(spconv_model) - paddle.device.cuda.synchronize() - - #warm up - dense_x.stop_gradient=True - out1 = dense_model(dense_x) - paddle.device.cuda.synchronize() - sparse_x.stop_gradient=True - out2 = sparse_model(sparse_x) - paddle.device.cuda.synchronize() - spconv_x.features.required_grad=False - out3 = spconv_model(spconv_x) - torch.cuda.synchronize(device) - #warm up - - t0 = time.time() - #padde dense - dense_x.stop_gradient=False - out1 = dense_model(dense_x) - out1.backward(out1) - paddle.device.cuda.synchronize() - t1 = time.time() - - #padde sparse - sparse_x.stop_gradient=False - out2 = sparse_model(sparse_x) - out2.backward(out2) - paddle.device.cuda.synchronize() - t2 = time.time() - - #spconv - spconv_x.features.required_grad=True - spconv_x.features.requires_grad_() - out3 = spconv_model(spconv_x) - out3.backward(out3) - torch.cuda.synchronize(device) - t3 = time.time() - - # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 - # Note 3. 只跑前向,sparse的耗时和spconv接近,稀疏度越高sparse的性能越好,当前方式测试前向+反向,spconv的耗时很高, 原因未知 - print("dense time: ", t1 - t0) - print("sparse time: ", t2 - t1) - print("spconv time: ", t3 - t2) - - # Note 4. 
paddle和torch的BN存在误差,测试shape=(4000, 64)的随机输入,单层BN前向误差在1e-6, 反向误差在1e-4 - #verify the forward calculation result - assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) - - #verify the backward calculation result - assert np.allclose(spconv_x.features.grad.cpu().numpy(), - sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) - -test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py deleted file mode 100644 index 3589dc83090f3..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py +++ /dev/null @@ -1,126 +0,0 @@ -import paddle -from paddle.vision.transforms import Compose, Normalize, ToTensor -from paddle.fluid.framework import _test_eager_guard -import time - -paddle.disable_static() -#transform = Compose([Normalize(mean=[127.5], -# std=[127.5], -# data_format='CHW')]) -transform = Compose([ToTensor()]) -# 使用transform对数据集做归一化 -print('download training data and load training data') -train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) -test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) -print('load finished') - -import numpy as np -#import matplotlib.pyplot as plt -train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] -train_data0 = train_data0.reshape([28,28]) -#plt.figure(figsize=(2,2)) -#plt.imshow(train_data0, cmap=plt.cm.binary) -print('train_data0 label is: ' + str(train_label_0)) - - -import paddle -import paddle.nn.functional as F -class SparseLeNet(paddle.nn.Layer): - def __init__(self): - super(SparseLeNet, self).__init__() - #self.bn = paddle.sparse.BatchNorm(1) - self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) - self.relu1 = paddle.sparse.ReLU() - self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) - self.relu2 = paddle.sparse.ReLU() - self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - - self.fc1 = paddle.nn.Linear(16*5*5, 120) - self.fc2 = paddle.nn.Linear(120, 84) - self.fc3 = paddle.nn.Linear(84, 10) - - def forward(self, x): - #x = self.bn(x) - x = self.conv1(x) - x = self.relu1(x) - x = self.pool1(x) - x = self.conv2(x) - x = self.relu2(x) - x = self.pool2(x) - x = x.to_dense() - - x = paddle.flatten(x, start_axis=1, stop_axis=-1) - x = self.fc1(x) - x = paddle.nn.functional.relu(x) - x = self.fc2(x) - x = paddle.nn.functional.relu(x) - x = self.fc3(x) - return x - -import paddle.nn.functional as F -train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) -# 加载训练集 batch_size 设为 64 -# sparse 训练 - -def prepare_data(x_data): - x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) - x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) - return x_data - -def sparse_train(model): - model.train() - epochs = 2 - optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) - # 用Adam作为优化函数 - for epoch in range(epochs): - for batch_id, data in enumerate(train_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - x_data.stop_gradient=False - predicts = model(x_data) - loss = F.cross_entropy(predicts, y_data) - # 计算损失 - acc = paddle.metric.accuracy(predicts, 
y_data) - loss.backward() - if batch_id % 300 == 0: - print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) - optim.step() - optim.clear_grad() - -test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) -# 加载测试数据集 -def test(model): - model.eval() - batch_size = 64 - for batch_id, data in enumerate(test_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - predicts = model(x_data) - # 获取预测结果 - loss = F.cross_entropy(predicts, y_data) - acc = paddle.metric.accuracy(predicts, y_data) - if batch_id % 20 == 0: - print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) - -with _test_eager_guard(): - sparse_model = SparseLeNet() - print(sparse_model) - - t0 = time.time() - sparse_train(sparse_model) - t1 = time.time() - print("spare time:", t1-t0) - test(sparse_model) - #x = paddle.randn((1, 1,28,28,1)) - #x.stop_gradient=False - #sparse_x = x.to_sparse_coo(4) - #print("sparse_x values shape:", sparse_x.values().shape) - #out = sparse_model(sparse_x) - #out.backward(out) - #print("end") - From de91d400ca163421bb4a71d7c4703fa873f62bbc Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 08:34:15 +0000 Subject: [PATCH 27/70] opt SparseMaskCopyKernel --- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 48 ++- .../unittests/test_sparse_middle_extractor.py | 324 ------------------ .../tests/unittests/test_sparse_mnist.py | 126 ------- 3 files changed, 35 insertions(+), 463 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index cbbdc122f616f..0b05433de83db 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { @@ -120,7 +121,7 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template +template __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const IntT* mask_indexs, const IntT* bound_out, @@ -129,10 +130,15 @@ __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const int64_t stride, T* out_values) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; const IntT j = bound_out[i]; if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride; k++) { - out_values[i * stride + k] = x_values[j * stride + k]; + for (int k = 0; k < stride / VecSize; k++) { + // out_values[i * stride + k] = x_values[j * stride + k]; + LoadT vec_x; + phi::Load(x_values + j * stride + k * VecSize, &vec_x); + phi::Store(vec_x, out_values + i * stride + k * VecSize); } } } @@ -230,16 +236,32 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, const int64_t stride = x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; - SparseMaskCopyKernel<<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), - mask_indexs.numel(), - stride, - out_ptr); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + SparseMaskCopyKernel + <<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); + } else { + SparseMaskCopyKernel + <<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); + } } template diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py deleted file mode 100644 index ae52b4a413336..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py +++ /dev/null @@ -1,324 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.sparse as sparse -from paddle.fluid.framework import _test_eager_guard -import time -import numpy as np -import torch -import spconv.pytorch as spconv -import inspect - -class MiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='MiddleExtractor'): - super(MiddleExtractor, self).__init__() - self.name = name - if not use_norm: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - else: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - def forward(self, x): - return self.middle_conv(x) - - -def get_pos_to_kw_map(func): - pos_to_kw = {} - fsig = inspect.signature(func) - pos = 0 - for name, info in fsig.parameters.items(): - if info.kind is info.POSITIONAL_OR_KEYWORD: - pos_to_kw[pos] = name - pos += 1 - return pos_to_kw - -def change_default_args(**kwargs): - def layer_wrapper(layer_class): - class DefaultArgLayer(layer_class): - def __init__(self, *args, **kw): - pos_to_kw = get_pos_to_kw_map(layer_class.__init__) - kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} - for key, val in kwargs.items(): - if key not in kw and kw_to_pos[key] > len(args): - kw[key] = val - super().__init__(*args, **kw) - - return DefaultArgLayer - - return layer_wrapper - -class Empty(torch.nn.Module): - def __init__(self, *args, **kwargs): - super(Empty, self).__init__() - - def forward(self, *args, **kwargs): - if len(args) == 1: - return args[0] - elif len(args) == 0: - return None - return 
args - -class SpconvMiddleExtractor(torch.nn.Module): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SpconvMiddleExtractor'): - super(SpconvMiddleExtractor, self).__init__() - if use_norm: - BatchNorm1d = change_default_args( - eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) - Linear = change_default_args(bias=False)(nn.Linear) - else: - BatchNorm1d = Empty - Linear = change_default_args(bias=True)(nn.Linear) - - middle_layers = [] - - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d1: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - - if use_norm: - #middle_layers.append( - # BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - - # assert len(num_filters_down2) > 0 - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d2: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - if use_norm: - #middle_layers.append( - #BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) - middle_layers.append(spconv.ToDense()) - self.middle_conv = spconv.SparseSequential(*middle_layers) - - def forward(self, x): - out = self.middle_conv(x) - return out - -class SparseMiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SparseMiddleExtractor'): - super(SparseMiddleExtractor, self).__init__() - self.name = name - - middle_layers = [] - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d1: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in 
range(len(num_filters) - 1)] - - for i, o in filters_pairs_d2: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - self.middle_conv = nn.Sequential(*middle_layers) - - def forward(self, x): - sparse_out = self.middle_conv(x) - #return sparse_out - return sparse_out.to_dense() - - -def test(): - paddle.seed(0) - with _test_eager_guard(): - in_channels = 128 - # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. - shape = [20, 40, 100] - batch_size = 1 - sparsity = 0.95 - - full_shape = [batch_size] + shape + [in_channels] - print(full_shape) - - total_elements = np.prod(shape) - nnz = int(total_elements * (1-sparsity)) - print("nnz=", nnz) - - #product indices - indices = [] - for i in range(4): - indices.append(paddle.randint(0, full_shape[i], [1, nnz])) - - indices = paddle.concat(indices) - #product values - values = paddle.randn((nnz, in_channels)) - - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) - - dense_x = sparse_x.to_dense() - - #spconv - device = torch.device("cuda") - torch_x = torch.tensor(dense_x.numpy(), device=device) - - spconv_x = spconv.SparseConvTensor.from_dense(torch_x) - - #whether to use batch_norm - use_norm = True - - dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) - sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - layer_nums = len(sparse_model.middle_conv) - block_size = 3 if use_norm else 2 - layer_nums = int(layer_nums / block_size) - - for i in range(0, layer_nums): - weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) - if use_norm: - bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) - - print(dense_model) - print(sparse_model) - print(spconv_model) - paddle.device.cuda.synchronize() - - #warm up - dense_x.stop_gradient=True - out1 = dense_model(dense_x) - paddle.device.cuda.synchronize() - sparse_x.stop_gradient=True - out2 = sparse_model(sparse_x) - paddle.device.cuda.synchronize() - spconv_x.features.required_grad=False - out3 = spconv_model(spconv_x) - torch.cuda.synchronize(device) - #warm up - - t0 = time.time() - #padde dense - dense_x.stop_gradient=False - out1 = dense_model(dense_x) - out1.backward(out1) - paddle.device.cuda.synchronize() - t1 = time.time() - - #padde sparse - sparse_x.stop_gradient=False - out2 = sparse_model(sparse_x) - out2.backward(out2) - paddle.device.cuda.synchronize() - t2 = time.time() - - #spconv - spconv_x.features.required_grad=True - spconv_x.features.requires_grad_() - out3 = spconv_model(spconv_x) - out3.backward(out3) - torch.cuda.synchronize(device) - t3 = time.time() - - # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 - # Note 3. 
只跑前向,sparse的耗时和spconv接近,稀疏度越高sparse的性能越好,当前方式测试前向+反向,spconv的耗时很高, 原因未知 - print("dense time: ", t1 - t0) - print("sparse time: ", t2 - t1) - print("spconv time: ", t3 - t2) - - # Note 4. paddle和torch的BN存在误差,测试shape=(4000, 64)的随机输入,单层BN前向误差在1e-6, 反向误差在1e-4 - #verify the forward calculation result - assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) - - #verify the backward calculation result - assert np.allclose(spconv_x.features.grad.cpu().numpy(), - sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) - -test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py deleted file mode 100644 index 3589dc83090f3..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py +++ /dev/null @@ -1,126 +0,0 @@ -import paddle -from paddle.vision.transforms import Compose, Normalize, ToTensor -from paddle.fluid.framework import _test_eager_guard -import time - -paddle.disable_static() -#transform = Compose([Normalize(mean=[127.5], -# std=[127.5], -# data_format='CHW')]) -transform = Compose([ToTensor()]) -# 使用transform对数据集做归一化 -print('download training data and load training data') -train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) -test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) -print('load finished') - -import numpy as np -#import matplotlib.pyplot as plt -train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] -train_data0 = train_data0.reshape([28,28]) -#plt.figure(figsize=(2,2)) -#plt.imshow(train_data0, cmap=plt.cm.binary) -print('train_data0 label is: ' + str(train_label_0)) - - -import paddle -import paddle.nn.functional as F -class SparseLeNet(paddle.nn.Layer): - def __init__(self): - super(SparseLeNet, self).__init__() - #self.bn = paddle.sparse.BatchNorm(1) - self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) - self.relu1 = paddle.sparse.ReLU() - self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) - self.relu2 = paddle.sparse.ReLU() - self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - - self.fc1 = paddle.nn.Linear(16*5*5, 120) - self.fc2 = paddle.nn.Linear(120, 84) - self.fc3 = paddle.nn.Linear(84, 10) - - def forward(self, x): - #x = self.bn(x) - x = self.conv1(x) - x = self.relu1(x) - x = self.pool1(x) - x = self.conv2(x) - x = self.relu2(x) - x = self.pool2(x) - x = x.to_dense() - - x = paddle.flatten(x, start_axis=1, stop_axis=-1) - x = self.fc1(x) - x = paddle.nn.functional.relu(x) - x = self.fc2(x) - x = paddle.nn.functional.relu(x) - x = self.fc3(x) - return x - -import paddle.nn.functional as F -train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) -# 加载训练集 batch_size 设为 64 -# sparse 训练 - -def prepare_data(x_data): - x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) - x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) - return x_data - -def sparse_train(model): - model.train() - epochs = 2 - optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) - # 用Adam作为优化函数 - for epoch in range(epochs): - for batch_id, data in enumerate(train_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - 
x_data = x_data.to_sparse_coo(4) - x_data.stop_gradient=False - predicts = model(x_data) - loss = F.cross_entropy(predicts, y_data) - # 计算损失 - acc = paddle.metric.accuracy(predicts, y_data) - loss.backward() - if batch_id % 300 == 0: - print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) - optim.step() - optim.clear_grad() - -test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) -# 加载测试数据集 -def test(model): - model.eval() - batch_size = 64 - for batch_id, data in enumerate(test_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - predicts = model(x_data) - # 获取预测结果 - loss = F.cross_entropy(predicts, y_data) - acc = paddle.metric.accuracy(predicts, y_data) - if batch_id % 20 == 0: - print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) - -with _test_eager_guard(): - sparse_model = SparseLeNet() - print(sparse_model) - - t0 = time.time() - sparse_train(sparse_model) - t1 = time.time() - print("spare time:", t1-t0) - test(sparse_model) - #x = paddle.randn((1, 1,28,28,1)) - #x.stop_gradient=False - #sparse_x = x.to_sparse_coo(4) - #print("sparse_x values shape:", sparse_x.values().shape) - #out = sparse_model(sparse_x) - #out.backward(out) - #print("end") - From a13794700ebdcd88c08e49cdbddd733e80e7bf8d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 08:39:10 +0000 Subject: [PATCH 28/70] merge origin --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 17 +-- .../kernels/sparse/gpu/coalesced_kernel.cu | 51 +++++---- .../phi/kernels/sparse/gpu/convolution.cu.h | 12 +- .../sparse/gpu/convolution_grad_kernel.cu | 94 ++++++++-------- .../kernels/sparse/gpu/convolution_kernel.cu | 104 +++++++++--------- 5 files changed, 144 insertions(+), 134 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 48b12f8a1b6de..cd89c916db577 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -41,9 +41,10 @@ __global__ void ScatterKernel(const T* input, T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; @@ -51,17 +52,19 @@ __global__ void ScatterKernel(const T* input, int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - StoreT sums={static_cast(0)}; + StoreT sums = {static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; LoadT vec_in; - phi::Load(input + out_feature_i * channels + channels_i * VecSize, &vec_in); + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); #pragma unroll - for(int k = 0; k < VecSize; k++){ + for (int k = 0; k < VecSize; k++) { sums[k] += vec_in[k]; } } - phi::Store(sums, out + indices_i * channels + channels_i * VecSize); + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 44ecb4365a9c1..60d90a18d4633 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -133,30 +133,33 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, // 5. scatter the values const int VecSize = VecBytes / sizeof(T); - if(stride % VecSize == 0){ - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - nnz * stride / VecSize, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); - }else{ - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + if (stride % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); } // 6. convert index to coordinate diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7f0d4814b75d0..a08c7931bb4f4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -24,12 +24,12 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { namespace sparse { @@ -53,16 +53,16 @@ __global__ void GatherKernel(const T* params, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size*slice_size/VecSize, int64_t) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size / VecSize, int64_t) { const int vec_slice_size = slice_size / VecSize; int indices_i = i / vec_slice_size; int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i*VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; + int64_t params_i = gather_i * slice_size + slice_i * VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; LoadT params_vec; - phi::Load(params + params_i, ¶ms_vec); + phi::Load(params + params_i, ¶ms_vec); phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index ebf2bf6cae896..d91c93fde66fb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -139,54 +139,56 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } const int VecSize = VecBytes / sizeof(T); - if(in_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); } - if(out_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels / VecSize, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if (out_channels % VecSize == 0) { + auto config = 
phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); } const T* kernel_ptr = kernel.data(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index c2942bdfd2b63..c1ea1c1894461 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -111,28 +111,30 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, &out_features, static_cast(0.0f)); const int VecSize = VecBytes / sizeof(T); - if(in_channels % VecSize == 0){ - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels / VecSize, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); - }else{ - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, n * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } else { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); } // 3. 
call gemm for every werght @@ -182,35 +184,35 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out_channels, false); } else { - if(out_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - } + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 6a92b32a5c70d25e571793ac54e7f2de5c7b4523 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 07:34:59 +0000 Subject: [PATCH 29/70] opt subm --- paddle/phi/core/sparse_coo_tensor.h | 7 ++ .../kernels/sparse/convolution_grad_kernel.h | 6 +- .../phi/kernels/sparse/convolution_kernel.h | 18 +---- .../sparse/cpu/convolution_grad_kernel.cc | 7 +- .../kernels/sparse/cpu/convolution_kernel.cc | 55 ++++++++----- .../phi/kernels/sparse/gpu/convolution.cu.h | 1 + .../sparse/gpu/convolution_grad_kernel.cu | 8 +- .../kernels/sparse/gpu/convolution_kernel.cu | 81 +++++++++++++------ .../kernels/test_sparse_conv3d_dev_api.cc | 18 ++--- python/paddle/utils/code_gen/sparse_api.yaml | 5 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 6 +- 11 files changed, 129 insertions(+), 83 deletions(-) diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index c65b5ce57430b..7f2da7afe824b 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -155,6 +155,11 @@ class SparseCooTensor : public TensorBase, /// \brief get the dnese dim int32_t dense_dim() const; + const DenseTensor& rulebook() const { return rulebook_; } + DenseTensor* mutable_rulebook() { return &rulebook_; } + void SetRulebook(const DenseTensor& rulebook) { rulebook_ = rulebook; } + const bool subm() const { return subm_; } + void SetSubm(const bool subm) { subm_ = subm; } private: // save the indices of non zero elements in original dense tensor @@ -165,6 +170,8 @@ class SparseCooTensor : public TensorBase, bool coalesced_ = false; // save the number of non zero elements in each batch DDim dims_; + DenseTensor rulebook_; + bool subm_ = false; /* --------------------------- */ /* example: non zero element is scalar */ /* --------------------------- */ diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index eebfcddfc7a9e..03a7403b1f41d 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -26,7 +26,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& 
out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -41,7 +41,7 @@ std::tuple Conv3dGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -55,7 +55,7 @@ std::tuple Conv3dGrad( Conv3dGradKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 62a72a9dd4115..90e46800c92ca 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -31,8 +31,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook); + SparseCooTensor* out); template SparseCooTensor Conv3d(const Context& dev_ctx, @@ -42,19 +41,10 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, - const bool subm, - DenseTensor* rulebook) { + const bool subm) { SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - &coo, - rulebook); + Conv3dKernel( + dev_ctx, x, kernel, paddings, dilations, strides, groups, subm, &coo); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5a981fb8df350..b750a688ff867 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -35,7 +35,7 @@ template void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -48,6 +48,7 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + const DenseTensor& rulebook = out.rulebook(); const IntT* rulebook_ptr = rulebook.data(); const int rulebook_len = rulebook.dims()[1]; @@ -182,7 +183,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -196,7 +197,7 @@ void Conv3dGradKernel(const Context& dev_ctx, Conv3dGradCPUKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 1b95de890deeb..6db26ffcc7094 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" @@ -35,8 +36,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -70,20 +70,37 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel); - - UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); - + DenseTensor* rulebook = nullptr; + // int n = 0; + if (subm && x.subm()) { + DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); + phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, &out_rulebook); + out->SetRulebook(out_rulebook); + rulebook = out->mutable_rulebook(); + // n = rulebook->dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(subm); + } else { + ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); + } int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -159,8 +176,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, @@ -171,8 +187,7 @@ void Conv3dKernel(const Context& dev_ctx, strides, groups, subm, - out, - rulebook); + out); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a08c7931bb4f4..aafd06d606d33 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -555,6 +555,7 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(true); } return rulebook_len; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d91c93fde66fb..a5573f74be441 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -42,7 +42,7 @@ template void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& 
paddings, const std::vector& dilations, @@ -55,6 +55,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + + const DenseTensor& rulebook = out.rulebook(); const IntT* rulebook_ptr = rulebook.data(); const int rulebook_len = rulebook.dims()[1]; @@ -253,7 +255,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -267,7 +269,7 @@ void Conv3dGradKernel(const Context& dev_ctx, Conv3dGradGPUKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index c1ea1c1894461..fc7f939e9bde9 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -34,8 +34,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -75,25 +74,61 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - int n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); + DenseTensor* rulebook = nullptr; + int n = 0; + if (subm && x.subm()) { + DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); + phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, &out_rulebook); + out->SetRulebook(out_rulebook); + rulebook = out->mutable_rulebook(); + n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); - const int* offsets_ptr = counter_per_kernel.data(); + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(subm); + const IntT* rulebook_ptr = rulebook->data(); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + n * sizeof(IntT), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + std::vector counter(kernel_size, 0); + for (int i = 0; i < n; i++) { + counter[h_counter[i]] += 1; + } + IntT offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + } else { + rulebook = out->mutable_rulebook(); + n = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); + } + + // const int* counter_ptr = counter_per_kernel.data(); + // const int* offsets_ptr = counter_per_kernel.data(); const IntT* 
rulebook_ptr = rulebook->data(); // 2. gather @@ -229,8 +264,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, @@ -241,8 +275,7 @@ void Conv3dKernel(const Context& dev_ctx, strides, groups, subm, - out, - rulebook); + out); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index b7d56cb0d2b06..b8ae97a449f80 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -112,8 +112,8 @@ void TestConv3dBase(const std::vector& indices, }; if (!std::is_same::value) { - DenseTensor rulebook = phi::Empty( - dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + // DenseTensor rulebook = phi::Empty( + // dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, x_tensor, kernel_tensor, @@ -121,8 +121,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, - subm, - &rulebook); + subm); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -142,7 +141,7 @@ void TestConv3dBase(const std::vector& indices, sparse::Conv3dGrad(dev_ctx_cpu, x_tensor, kernel_tensor, - rulebook, + out, out, paddings, dilations, @@ -196,8 +195,8 @@ void TestConv3dBase(const std::vector& indices, phi::Copy( dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); - DenseTensor d_rulebook = phi::Empty( - dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + // DenseTensor d_rulebook = phi::Empty( + // dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, @@ -205,8 +204,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, - subm, - &d_rulebook); + subm); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -243,7 +241,7 @@ void TestConv3dBase(const std::vector& indices, sparse::Conv3dGrad(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, - d_rulebook, + d_out, d_out, paddings, dilations, diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 84c6d2a16af43..34b4fd317283c 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -1,10 +1,9 @@ - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(out), Tensor(rulebook) + output : Tensor(out) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense} + func : sparse_conv3d{sparse_coo, dense -> sparse_coo} layout : x - intermediate : rulebook backward : conv3d_grad - api : coo_to_dense diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 5d9874dff29ec..0c00ba4c16491 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -1,9 +1,9 @@ - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> 
Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor) + args : (Tensor x, Tensor kernel, Tensor out, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) From e96f090365266dbedab26870513e2d4c9e14582c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 11:18:55 +0000 Subject: [PATCH 30/70] opt subm --- .../phi/kernels/sparse/gpu/convolution.cu.h | 386 ++++++++++++------ 1 file changed, 261 insertions(+), 125 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..982c06dfcd8f3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -268,6 +268,111 @@ __global__ void ProductRuleBookKernel(const T* x_indices, } } +template +__global__ void GetOutIndexTable(const IntT* indices, + const IntT non_zero_num, + const Dims4D dims, + IntT* out_index_table) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { + IntT batch = indices[i]; + IntT in_z = indices[i + non_zero_num]; + IntT in_y = indices[i + 2 * non_zero_num]; + IntT in_x = indices[i + 3 * non_zero_num]; + IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); + out_index_table[index] = i; + } +} + +template +__global__ void ProductSubmRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + const T* out_index_table, + T* rulebook, + int* counter, + T* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + extern __shared__ int counter_buf[]; // kernel_size + int* counter_buf2 = counter_buf + kernel_size; + // length = kernel_size * blockDim.x * 2; + int* rulebook_buf = counter_buf + kernel_size * 2; + + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + T batch = x_indices[i]; + T in_z = x_indices[i + non_zero_num]; + T in_y = x_indices[i + 2 * non_zero_num]; + T in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + T out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + T out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + T out_x = (in_x + paddings[3] - kx * dilations[3]) / 
strides[3]; + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + int real_out_index = out_index_table[out_index]; + if (real_out_index != -1) { + in_i = i; + int buf_i = atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + rulebook_buf[kernel_index * blockDim.x + buf_i] = in_i; + rulebook_buf[kernel_index * blockDim.x + + kernel_size * blockDim.x + buf_i] = real_out_index; + } + } + // rulebook[kernel_index * non_zero_num + i] = kernel_i; + // rulebook[kernel_index * non_zero_num + offset + i] = in_i; + // rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf2[i] = atomicAdd(&counter[i], counter_buf[i]); + } + __syncthreads(); + for (int i = 0; i < kernel_size; i++) { + if (threadIdx.x < counter_buf[i]) { + rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; + rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = + rulebook_buf[i * blockDim.x + threadIdx.x]; + rulebook[i * non_zero_num + offset * 2 + counter_buf2[i] + threadIdx.x] = + rulebook_buf[i * blockDim.x + kernel_size * blockDim.x + threadIdx.x]; + } + } +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -309,12 +414,6 @@ int ProductRuleBook(const Context& dev_ctx, int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); - *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); - IntT* rulebook_ptr = rulebook->data(); const auto x_dims = x.dims(); Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); @@ -329,145 +428,193 @@ int ProductRuleBook(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - rulebook_ptr, - counter_ptr, - in_indexs.data()); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); - IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, - sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - rulebook_len /= 3; - if (subm) { // At present, hashtable is not used to map the input and output indexes. // At present, the intermediate output index is generated by normal // convolution, // and then the intermediate output index is subtracted from the input index // to obain the rulebook. 
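// The "+" block below replaces that intermediate-index/lower_bound scheme for the
// subm case with a dense lookup table: GetOutIndexTable scatters each nonzero's row
// position into out_index_table (pre-filled with -1), keyed by its flattened
// coordinate, and ProductSubmRuleBookKernel then resolves every candidate output
// coordinate with a single table read, skipping entries that stayed -1.
// A minimal standalone CUDA sketch of the same build-then-lookup pattern
// (hypothetical names, no phi:: dependencies; assumes the flattened indices are
// unique and the table was filled with -1 beforehand; not part of this kernel):
//
//   __global__ void BuildIndexTable(const int* flat_idx, int nnz, int* table) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < nnz) table[flat_idx[i]] = i;   // record the row of each nonzero
//   }
//
//   __global__ void LookupIndex(const int* query, int n,
//                               const int* table, int* pos) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) pos[i] = table[query[i]];   // -1 means the coordinate is absent
//   }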
+ const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); + IntT* rulebook_ptr = tmp_rulebook.data(); + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - // call lower_bound to get the real index of out_index - const IntT* in_indexs_ptr = in_indexs.data(); - IntT* out_indexs_ptr = rulebook_ptr + 2 * rulebook_len; - DenseTensor bound = phi::Empty( - dev_ctx, - DenseTensorMeta( - indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); - IntT* bound_ptr = bound.data(); -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - in_indexs_ptr, - in_indexs_ptr + in_indexs.numel(), - out_indexs_ptr, - out_indexs_ptr + rulebook_len, - bound_ptr); - - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - - UpdateOutIndexAndCounterAfterLowerBound<<>>( - in_indexs_ptr, - bound.data(), - rulebook_len, - kernel_size, - x.nnz(), - rulebook_ptr, - out_indexs_ptr, - counter_ptr); - -// remove -1 + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); + IntT* out_index_table_ptr = out_index_table.data(); + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), + out_index_table_ptr, + out_index_table_ptr + out_index_table.numel(), + -1); + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + GetOutIndexTable<<>>( + out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); + + if (config.thread_per_block.x > 128) { + config.block_per_grid.x *= config.thread_per_block.x / 128; + config.thread_per_block.x = 128; + } + size_t cache_size = kernel_size * 2 + kernel_size * + config.thread_per_block.x * 2 * + sizeof(int); + ProductSubmRuleBookKernel + <<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + out_index_table_ptr, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + + out->SetMember(out_indices, out_values, out_dims, true); + + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + int rulebook_len = + (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; + DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); + IntT* out_rulebook_ptr = out_rulebook.data(); + for (int i = 0; i < kernel_size; i++) { + if ((*h_counter)[i] <= 0) continue; + phi::backends::gpu::GpuMemcpyAsync(out_rulebook_ptr + (*h_offsets)[i], + rulebook_ptr + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + out_rulebook_ptr + rulebook_len + (*h_offsets)[i], + rulebook_ptr + kernel_size * 
non_zero_num + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + out_rulebook_ptr + 2 * rulebook_len + (*h_offsets)[i], + rulebook_ptr + 2 * kernel_size * non_zero_num + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + } + *rulebook = out_rulebook; + return rulebook_len; + + } else { + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); + IntT* rulebook_ptr = rulebook->data(); + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + + // 2. remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif rulebook_ptr, - rulebook_ptr + 3 * rulebook_len, + rulebook_ptr + rulebook_rows * rulebook_cols, -1); - phi::funcs::sparse::DistanceKernel - <<<1, 1, 0, dev_ctx.stream()>>>(rulebook_ptr, last, bound_ptr); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - bound_ptr, - sizeof(IntT), + + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + IntT rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(IntT), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); dev_ctx.Wait(); rulebook_len /= 3; - } #ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), #else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); - - rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); + dev_ctx.stream()); - if (!subm) { + rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. 
sorted or merge the out index out_index->ResizeAndAllocate({static_cast(rulebook_len)}); unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); @@ -538,19 +685,8 @@ int ProductRuleBook(const Context& dev_ctx, out_indices_ptr, rulebook_ptr + 2 * rulebook_len); out->SetMember(out_indices, out_values, out_dims, true); - } else { - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::Empty(dev_ctx, - DenseTensorMeta(x.dtype(), - {x.nnz(), kernel_sizes[4]}, - x.non_zero_elements().layout())); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; } - return rulebook_len; } } // namespace sparse From c7eddc52808f5518e5b14d440dd2118b360c5948 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 12:12:30 +0000 Subject: [PATCH 31/70] opt copy rulebook --- .../phi/kernels/sparse/gpu/convolution.cu.h | 70 +++++++++++++------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index e4c42b701c0d7..d8cc45c445159 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -289,6 +289,41 @@ __global__ void GetOutIndexTable(const IntT* indices, } } +template +__global__ void CopyRuleBook(const int* counters, + const int* offsets, + const IntT* in_rulebook, + const int len, + const int kernel_size, + const int non_zero_num, + IntT* out_rulebook) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + extern __shared__ int cache_counters[]; + int* cache_offsets = cache_counters + kernel_size; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_counters[i] = counters[i]; + cache_offsets[i] = offsets[i]; + } + __syncthreads(); + for (int i = tid; i < len; i += gridDim.x * blockDim.x) { + // get the kernel index + int kernel_index = 0; + for (; kernel_index < kernel_size - 1; kernel_index++) { + if (i >= offsets[kernel_index] && i < offsets[kernel_index + 1]) { + break; + } + } + int inner_index = i - offsets[kernel_index]; + out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; + out_rulebook[len + i] = + in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + + inner_index]; + out_rulebook[2 * len + i] = + in_rulebook[2 * kernel_size * non_zero_num + + kernel_index * non_zero_num + inner_index]; + } +} + template __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D x_dims, @@ -355,9 +390,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, kernel_size * blockDim.x + buf_i] = real_out_index; } } - // rulebook[kernel_index * non_zero_num + i] = kernel_i; - // rulebook[kernel_index * non_zero_num + offset + i] = in_i; - // rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; ++kernel_index; } } @@ -519,26 +551,18 @@ int ProductRuleBook(const Context& dev_ctx, (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); - for (int i = 0; i < kernel_size; i++) { - if ((*h_counter)[i] <= 0) continue; - phi::backends::gpu::GpuMemcpyAsync(out_rulebook_ptr + (*h_offsets)[i], - rulebook_ptr + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync( - out_rulebook_ptr + 
rulebook_len + (*h_offsets)[i], - rulebook_ptr + kernel_size * non_zero_num + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync( - out_rulebook_ptr + 2 * rulebook_len + (*h_offsets)[i], - rulebook_ptr + 2 * kernel_size * non_zero_num + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - } + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + cache_size = kernel_size * 2 * sizeof(int); + CopyRuleBook<<>>(counter_ptr, + offsets_ptr, + rulebook_ptr, + rulebook_len, + kernel_size, + non_zero_num, + out_rulebook_ptr); *rulebook = out_rulebook; return rulebook_len; From dd5e4fde5403f32bee0d7624140bc588a955f4f2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 01:34:13 +0000 Subject: [PATCH 32/70] check cache size --- .../phi/kernels/sparse/gpu/convolution.cu.h | 67 ++++++++----------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d8cc45c445159..a0b98603f85cc 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -214,10 +215,8 @@ __global__ void ProductRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int counter_buf[]; // kernel_size const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; @@ -233,9 +232,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -333,11 +329,9 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, const T* out_index_table, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; extern __shared__ int counter_buf[]; // kernel_size @@ -357,9 +351,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -447,8 +438,6 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = 
non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(indices_dtype, {x.nnz()}, DataLayout::NCHW)); int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; @@ -503,30 +492,34 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>( out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); - if (config.thread_per_block.x > 128) { - config.block_per_grid.x *= config.thread_per_block.x / 128; - config.thread_per_block.x = 128; - } size_t cache_size = kernel_size * 2 + kernel_size * config.thread_per_block.x * 2 * sizeof(int); - ProductSubmRuleBookKernel - <<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - out_index_table_ptr, - rulebook_ptr, - counter_ptr, - in_indexs.data()); + const int MAX_CACHE_SIZE = 48 * 1024; + while (cache_size >= MAX_CACHE_SIZE) { + config.thread_per_block.x /= 2; + config.block_per_grid.x *= 2; + PADDLE_ENFORCE_GE(config.thread_per_block.x, + 32, + phi::errors::Fatal("the shared memory is not enough")); + size_t cache_size = kernel_size * 2 + kernel_size * + config.thread_per_block.x * 2 * + sizeof(int); + } + ProductSubmRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + out_index_table_ptr, + rulebook_ptr, + counter_ptr); out->SetMember(out_indices, out_values, out_dims, true); @@ -584,10 +577,8 @@ int ProductRuleBook(const Context& dev_ctx, d_paddings, d_dilations, d_strides, - subm, rulebook_ptr, - counter_ptr, - in_indexs.data()); + counter_ptr); // 2. remove -1 #ifdef PADDLE_WITH_HIP From 52367a3160de506e43fc42d15e9fab9efce6b5fa Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 01:34:13 +0000 Subject: [PATCH 33/70] check cache size --- .../phi/kernels/sparse/gpu/convolution.cu.h | 66 ++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d8cc45c445159..a3eb06563739c 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -214,10 +215,8 @@ __global__ void ProductRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int counter_buf[]; // kernel_size const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; @@ -233,9 +232,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -333,11 +329,9 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, const T* out_index_table, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; extern __shared__ int counter_buf[]; // kernel_size @@ -357,9 +351,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -447,8 +438,6 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(indices_dtype, {x.nnz()}, DataLayout::NCHW)); int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; @@ -503,30 +492,33 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>( out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); - if (config.thread_per_block.x > 128) { - config.block_per_grid.x *= config.thread_per_block.x / 128; - config.thread_per_block.x = 128; - } size_t cache_size = kernel_size * 2 + kernel_size * config.thread_per_block.x * 2 * sizeof(int); - ProductSubmRuleBookKernel - <<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - out_index_table_ptr, - rulebook_ptr, - counter_ptr, - in_indexs.data()); + const int MAX_CACHE_SIZE = 48 * 1024; + while (cache_size >= MAX_CACHE_SIZE) { + config.thread_per_block.x /= 2; + config.block_per_grid.x *= 2; + PADDLE_ENFORCE_GE(config.thread_per_block.x, + 32, + phi::errors::Fatal("the shared memory is not enough")); + cache_size = kernel_size * 2 + + kernel_size * config.thread_per_block.x * 2 * sizeof(int); + } + ProductSubmRuleBookKernel<<>>(indices_ptr, + d_x_dims, + 
d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + out_index_table_ptr, + rulebook_ptr, + counter_ptr); out->SetMember(out_indices, out_values, out_dims, true); @@ -584,10 +576,8 @@ int ProductRuleBook(const Context& dev_ctx, d_paddings, d_dilations, d_strides, - subm, rulebook_ptr, - counter_ptr, - in_indexs.data()); + counter_ptr); // 2. remove -1 #ifdef PADDLE_WITH_HIP From 345ebb2c512dcecf831ed6bba10187f4d08f3987 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 05:28:03 +0000 Subject: [PATCH 34/70] correct alloc out values --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a3eb06563739c..7a3ec91bb9835 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -469,7 +469,11 @@ int ProductRuleBook(const Context& dev_ctx, IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + DenseTensor out_values = + phi::Empty(dev_ctx, + DenseTensorMeta(x.dtype(), + {x.nnz(), kernel_sizes[4]}, + x.non_zero_elements().layout())); phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); From 596bfbddee69eda16cdc272b47e220ca01eec7a7 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 24 Jun 2022 14:24:10 +0000 Subject: [PATCH 35/70] fix backward --- paddle/phi/kernels/sparse/cpu/convolution.h | 7 ++-- .../kernels/sparse/cpu/convolution_kernel.cc | 17 +++++---- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 10 +++--- .../sparse/gpu/convolution_grad_kernel.cu | 16 ++------- .../kernels/sparse/gpu/convolution_kernel.cu | 36 ------------------- .../tests/unittests/test_sparse_conv_op.py | 12 ++++--- .../incubate/sparse/nn/functional/conv.py | 3 +- 7 files changed, 29 insertions(+), 72 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index b2544619774c2..07baf77ff5d27 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -41,13 +41,12 @@ void ProductRuleBook(const Context& dev_ctx, const DDim& out_dims, const bool subm, DenseTensor* rulebook, - DenseTensor* counter_per_kernel) { + std::vector* counter_per_kernel) { const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); - int* counter_ptr = counter_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - memset(counter_ptr, 0, kernel_size * sizeof(int)); + memset(counter_per_kernel->data(), 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len @@ -107,7 +106,7 @@ void ProductRuleBook(const Context& dev_ctx, } if (rulebook_ptr == nullptr) { - counter_ptr[kernel_index - 1] += 1; + (*counter_per_kernel)[kernel_index - 1] += 1; ++rulebook_len; } else { rulebook_ptr[rulebook_index] = kernel_index - 1; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 8fdd5c5ca0f51..7c37d90cd4cf9 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -67,9 +67,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // Second algorithm: // 
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; @@ -80,11 +78,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, if (subm && table != nullptr) { const DenseTensor& rulebook = table->first; rulebook_ptr = rulebook.data(); - // DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); - // phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, - // &out_rulebook); out->SetRulebook(out_rulebook); out->SetTablePtr(x.GetTablePtr()); - // rulebook = out->mutable_rulebook(); n = rulebook.dims()[1]; DenseTensor out_indices = @@ -93,7 +87,9 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); - // out->SetSubm(subm); + memcpy(counter_per_kernel.data(), + table->second.data(), + kernel_size * sizeof(int)); } else { DenseTensor rulebook; ProductRuleBook(dev_ctx, @@ -110,9 +106,12 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, out_channels, out_dims, &rulebook, out); n = rulebook.dims()[1]; + out->SetTablePtr(x.GetTablePtr()); + out->SetTable(key, std::make_pair(rulebook, counter_per_kernel)); + rulebook_ptr = rulebook.data(); } // int n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); + const int* counter_ptr = counter_per_kernel.data(); // 2. gather DenseTensorMeta in_features_meta( diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index 7655913374dbd..f4d6e807538ea 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -47,9 +47,11 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + // DenseTensorMeta counter_meta( + // DataType::INT32, {kernel_size}, DataLayout::NCHW); + // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, + // std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); const T* in_features_ptr = x.non_zero_elements().data(); // 1. 
product rule book @@ -69,7 +71,7 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, int rulebook_len = rulebook->dims()[1]; const IntT* rulebook_ptr = rulebook->data(); - const int* counter_ptr = counter_per_kernel.data(); + const int* counter_ptr = counter_per_kernel.data(); std::vector offsets(kernel_size + 1); phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 1560f2faa5cac..9533d456af0a6 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -100,20 +100,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, &x_grad_indices); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(rulebook_len, 0); - // phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], - // rulebook_ptr, - // rulebook_len * sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - // dev_ctx.Wait(); + std::vector offsets(kernel_size + 1); + const auto& counter = table->second; - // for (int i = 0; i < rulebook_len; i++) { - // counter[h_counter[i]] += 1; - // } - memcpy(counter.data(), table->second.data(), kernel_size * sizeof(int)); - IntT offset = 0, max_count = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 3df89322405f8..95475ed5cb9e3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -77,7 +77,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); int n = 0; - // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; PADDLE_ENFORCE_EQ( key.empty(), @@ -90,11 +89,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); out->SetTablePtr(x.GetTablePtr()); - clock_t t0 = clock(); - // DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); - // phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, - // &out_rulebook); out->SetRulebook(out_rulebook); rulebook = - // out->mutable_rulebook(); n = rulebook.dims()[1]; DenseTensor out_indices = @@ -103,39 +97,14 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); - // out->SetSubm(subm); - // const IntT* rulebook_ptr = rulebook->data(); - // std::vector counter(n, 0); - // clock_t t1 = clock(); - // phi::backends::gpu::GpuMemcpyAsync(&counter[0], - // rulebook_ptr, - // n * sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - // dev_ctx.Wait(); - // clock_t t2 = clock(); - // for (int i = 0; i < n; i++) { - // PADDLE_ENFORCE_LT(counter[i], - // kernel_size, - // phi::errors::Fatal("the kernel index must less than - // kernel_size")); - // h_counter[counter[i]] += 1; - // } IntT offset = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += h_counter[i]; } offsets[kernel_size] = offset; - // clock_t t3 = clock(); - // auto f = [](clock_t start, clock_t end) -> float{ - // return (float)(end-start)/CLOCKS_PER_SEC; - // }; - // printf("%f %f 
%f\n", f(t0, t1), f(t1, t2), f(t2, t3)); } else { DenseTensor rulebook; - // rulebook = &empty_rulebook; - // rulebook = out->mutable_rulebook(); n = ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -152,16 +121,11 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out, &h_counter, &offsets); - // out->SetSubm(subm); out->SetTablePtr(x.GetTablePtr()); out->SetTable(key, std::make_pair(rulebook, h_counter)); rulebook_ptr = rulebook.data(); } - // const int* counter_ptr = counter_per_kernel.data(); - // const int* offsets_ptr = counter_per_kernel.data(); - /// const IntT* rulebook_ptr = rulebook->data(); - // 2. gather DenseTensorMeta in_features_meta( x.dtype(), {n, in_channels}, DataLayout::NCHW); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e1a9b2428babc..afd9c33421660 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -51,6 +51,7 @@ def test_conv3d(self): padding=paddings, dilation=dilations, groups=1, + key='conv3d', data_format="NDHWC") out.backward(out) assert np.array_equal(correct_out_values, out.values().numpy()) @@ -66,7 +67,7 @@ def test_subm_conv3d(self): indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') y = paddle.incubate.sparse.nn.functional.subm_conv3d( - sparse_x, weight) + sparse_x, weight, key='subm_conv') assert np.array_equal(sparse_x.indices().numpy(), y.indices().numpy()) @@ -84,13 +85,13 @@ def test_Conv3D(self): indices, values, dense_shape, False) sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( - 1, 1, (1, 3, 3), data_format='NDHWC') + 1, 1, (1, 3, 3), data_format='NDHWC', key='conv3d') sparse_out = sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW') + 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') def test_SubmConv3D(self): with _test_eager_guard(): @@ -104,7 +105,7 @@ def test_SubmConv3D(self): indices, values, dense_shape, False) subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NDHWC') + 1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv') # test extra_repr print(subm_conv3d.extra_repr()) @@ -116,7 +117,7 @@ def test_SubmConv3D(self): with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW') + 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') def test_Conv3D_bias(self): with _test_eager_guard(): @@ -129,6 +130,7 @@ def test_Conv3D_bias(self): sp_conv3d = paddle.incubate.sparse.nn.Conv3D(3, 2, 3, + key='conv3d', data_format='NDHWC') sp_conv3d.weight.set_value( paddle.to_tensor(conv3d.weight.numpy().transpose(2, 3, 4, 1, diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index a0de73e30fb86..62800dd01c65b 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -84,6 +84,7 @@ def conv3d(x, padding=0, dilation=1, groups=1, + key=None, data_format="NDHWC", name=None): r""" @@ -188,7 +189,7 @@ def conv3d(x, # (1, 1, 1, 2, 1) """ return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, - None, data_format, name) + key, data_format, name) def subm_conv3d(x, From 
c906bdb835add55f94354f8865519425b27d1f72 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sat, 25 Jun 2022 11:29:59 +0000 Subject: [PATCH 36/70] opt conv --- .../phi/kernels/sparse/gpu/convolution.cu.h | 283 ++++++++++++------ .../kernels/sparse/gpu/convolution_kernel.cu | 2 +- 2 files changed, 197 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7a3ec91bb9835..fddb743de5af7 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -68,6 +68,52 @@ __global__ void GatherKernel(const T* params, } } +template +__global__ void UniqueKernel(const IntT* in_indexs, + const int rulebook_len, + int* out_index_table, + int* out_indexs, + int* nnz) { + extern __shared__ int cache[]; + __shared__ int count, start; + if (threadIdx.x == 0) { + count = 0; + start = 0; + } + __syncthreads(); + + int i = threadIdx.x + blockDim.x * blockIdx.x; + if (i < rulebook_len) { + // atomicOr only support int + int index = static_cast(in_indexs[i]); + int change_index = index == 0 ? 1 : index; + int flag = atomicOr(out_index_table + index, change_index); + if (flag == 0) { + int j = atomicAdd(&count, 1); + cache[j] = index; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + start = atomicAdd(nnz, count); + } + __syncthreads(); + for (int i = threadIdx.x; i < count; i += blockDim.x) { + out_indexs[start + i] = cache[i]; + } +} + +template +__global__ void UpdateOutIndex(const int* out_index_table, + const int n, + IntT* out_indexs) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = out_indexs[i]; + out_indexs[i] = out_index_table[index]; + } +} + template inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, const IntT* rulebook_ptr, @@ -75,42 +121,38 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, DenseTensor* out_index, DenseTensor* unique_key, DenseTensor* unique_value) { - phi::IndexKernel>( - dev_ctx, out_index, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, unique_value, kps::IdentityFunctor()); - - phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - rulebook_ptr, - sizeof(IntT) * len, -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToDevice, -#else - cudaMemcpyDeviceToDevice, -#endif - dev_ctx.stream()); -// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher -// performance, but thrust::merge_by_key limited by data size -#ifdef PADDLE_WITH_HIP - thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key->data(), - unique_key->data() + len, - out_index->data()); - - // 4. 
unique - thrust::pair new_end = -#ifdef PADDLE_WITH_HIP - thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key->data(), - unique_key->data() + len, - unique_value->data()); - return new_end.first; + // phi::IndexKernel>( + // dev_ctx, out_index, kps::IdentityFunctor()); + // phi::IndexKernel>( + // dev_ctx, unique_value, kps::IdentityFunctor()); + // + // phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), + // rulebook_ptr, + // sizeof(IntT) * len, + // gpuMemcpyDeviceToDevice, + // dev_ctx.stream()); + //// compared with thrust::sort_by_key, thrust::merge_by_key may achieved + // higher / performance, but thrust::merge_by_key limited by data size + // #ifdef PADDLE_WITH_HIP + // thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), + // #else + // thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), + // #endif + // unique_key->data(), + // unique_key->data() + len, + // out_index->data()); + // + // // 4. unique + // thrust::pair new_end = + // #ifdef PADDLE_WITH_HIP + // thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), + // #else + // thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), + // #endif + // unique_key->data(), + // unique_key->data() + len, + // unique_value->data()); + // return new_end.first; } /** @@ -285,6 +327,26 @@ __global__ void GetOutIndexTable(const IntT* indices, } } +template +__global__ void GetOutIndexTable(const int* indexs, + const int non_zero_num, + const Dims4D out_dims, + int* out_index_table, + IntT* out_indices) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { + IntT index = static_cast(indexs[i]); + out_index_table[index] = i; + IntT batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + } +} + template __global__ void CopyRuleBook(const int* counters, const int* offsets, @@ -649,66 +711,113 @@ int ProductRuleBook(const Context& dev_ctx, int* out_index_ptr = out_index->data(); int* unique_value_ptr = unique_value->data(); IntT* unique_key_ptr = unique_key.data(); - - IntT* new_end = - SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - &unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - IntT out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif + cudaMemsetAsync(unique_value_ptr, 0, sizeof(int), dev_ctx.stream()); + + // IntT* new_end = + // SortedAndUniqueIndex(dev_ctx, + // rulebook_ptr + 2 * rulebook_len, + // rulebook_len, + // out_index, + // &unique_key, + // unique_value); + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, 
{table_size}); + int* out_index_table_ptr = out_index_table.data(); + cudaMemsetAsync( + out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + size_t cache_size = sizeof(int) * config.thread_per_block.x; + UniqueKernel<<>>(rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index_table_ptr, + out_index_ptr, + unique_value_ptr); + int out_nnz = 0; + cudaMemcpyAsync(&out_nnz, + unique_value_ptr, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); - // 5. update out_indices and rulebook by unique_value_ptr const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( - indices_dtype, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), - {out_non_zero_num, kernel_sizes[4]}, - x.non_zero_elements().layout()); + indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_nnz, kernel_sizes[4]}, x.non_zero_elements().layout()); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + out->SetMember(out_indices, out_values, out_dims, true); IntT* out_indices_ptr = out_indices.data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel - <<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); + thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), + out_index_ptr, + out_index_ptr + out_nnz); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + GetOutIndexTable<<>>(out_index_ptr, + out_nnz, + d_out_dims, + out_index_table_ptr, + out_indices_ptr); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + UpdateOutIndex<<>>( + out_index_table_ptr, rulebook_len, rulebook_ptr + 2 * rulebook_len); + + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, + // dev_ctx.stream()>>>( + // unique_key_ptr, + // new_end, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1); + // IntT out_non_zero_num = 0; + // #ifdef PADDLE_WITH_HIP + // phi::backends::gpu::GpuMemcpyAsync( + // &out_non_zero_num, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // hipMemcpyDeviceToHost, + // dev_ctx.stream()); + // #else + // phi::backends::gpu::GpuMemcpyAsync( + // &out_non_zero_num, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // cudaMemcpyDeviceToHost, + // dev_ctx.stream()); + // #endif + + // 5. 
update out_indices and rulebook by unique_value_ptr + // config = + // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + // out_non_zero_num, 1); + /// UpdateIndexKernel + // <<>>(unique_key_ptr, + // unique_value_ptr, + // out_index_ptr, + // out_non_zero_num, + // rulebook_len, + // d_out_dims, + // out_indices_ptr, + // rulebook_ptr + 2 * rulebook_len); return rulebook_len; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 0abeda531d6df..092b40a1ff4f0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -170,7 +170,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { + if (true) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); From 6dc1584d3f86874767b3e5726e95037e2d3494ac Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sun, 26 Jun 2022 10:34:27 +0000 Subject: [PATCH 37/70] opt conv3d --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 35 +++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 95 ++++++------------- .../kernels/sparse/gpu/convolution_kernel.cu | 46 ++------- 3 files changed, 73 insertions(+), 103 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 3afe13db73a9a..39a2ad7a1db0a 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -97,6 +97,41 @@ __global__ void ScatterCUDAKernel(const T* params, } } +template +__global__ void ScatterKernelV2(const T* input, + const int* out_index_counts, + const int* origin_out_indexs, + const int non_zero_num, + const int kernel_size, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + + int len = out_index_counts[indices_i]; + // max(end-start) = kernel_size + StoreT sums = {static_cast(0)}; + for (int j = 0; j < len; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index fddb743de5af7..88b2b97c79404 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -107,10 +107,19 @@ __global__ void UniqueKernel(const IntT* in_indexs, template __global__ void UpdateOutIndex(const int* out_index_table, const int n, - IntT* out_indexs) { + const int kernel_size, + IntT* out_indexs, + int* out_index_counts, + int* origin_out_indexs) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = out_indexs[i]; - out_indexs[i] = out_index_table[index]; + int real_index = out_index_table[index]; + out_indexs[i] = real_index; + + // kernel_size at most + int j = atomicAdd(out_index_counts + real_index, 1); + // nnz * kernel_size + 
origin_out_indexs[real_index * kernel_size + j] = i; } } @@ -703,23 +712,11 @@ int ProductRuleBook(const Context& dev_ctx, rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index out_index->ResizeAndAllocate({static_cast(rulebook_len)}); - unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta( - indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); + DenseTensor unique_key = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - IntT* unique_key_ptr = unique_key.data(); - cudaMemsetAsync(unique_value_ptr, 0, sizeof(int), dev_ctx.stream()); - - // IntT* new_end = - // SortedAndUniqueIndex(dev_ctx, - // rulebook_ptr + 2 * rulebook_len, - // rulebook_len, - // out_index, - // &unique_key, - // unique_value); + int* unique_key_ptr = unique_key.data(); + int64_t table_size = 1; for (int i = 0; i < out_dims.size() - 1; i++) { table_size *= out_dims[i]; @@ -728,6 +725,7 @@ int ProductRuleBook(const Context& dev_ctx, int* out_index_table_ptr = out_index_table.data(); cudaMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); + int* unique_value_ptr = unique_value->data(); + + // return rulebook_len; UpdateOutIndex<<>>( - out_index_table_ptr, rulebook_len, rulebook_ptr + 2 * rulebook_len); - - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, - // dev_ctx.stream()>>>( - // unique_key_ptr, - // new_end, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1); - // IntT out_non_zero_num = 0; - // #ifdef PADDLE_WITH_HIP - // phi::backends::gpu::GpuMemcpyAsync( - // &out_non_zero_num, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // hipMemcpyDeviceToHost, - // dev_ctx.stream()); - // #else - // phi::backends::gpu::GpuMemcpyAsync( - // &out_non_zero_num, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // cudaMemcpyDeviceToHost, - // dev_ctx.stream()); - // #endif - - // 5. 
update out_indices and rulebook by unique_value_ptr - // config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - // out_non_zero_num, 1); - /// UpdateIndexKernel - // <<>>(unique_key_ptr, - // unique_value_ptr, - // out_index_ptr, - // out_non_zero_num, - // rulebook_len, - // d_out_dims, - // out_indices_ptr, - // rulebook_ptr + 2 * rulebook_len); + dev_ctx.stream()>>>(out_index_table_ptr, + rulebook_len, + kernel_size, + rulebook_ptr + 2 * rulebook_len, + out_index_ptr, + unique_value_ptr); + return rulebook_len; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 8df2939bdf551..e87fd71d511f6 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -199,42 +199,10 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, tmp_kernel_ptr, static_cast(0), tmp_out_ptr); - - if (subm) { - // if(out_channels % VecSize == 0){ - // auto config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * - // out_channels/VecSize, 1); - // phi::funcs::sparse::ScatterCUDAKernel - // <<>>(out_features_ptr, - // rulebook_ptr + 2 * n + offsets[i], - // out_values_ptr, - // M, - // out_channels, - // false); - // }else{ - // auto config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * - // out_channels, 1); - // phi::funcs::sparse::ScatterCUDAKernel - // <<>>(out_features_ptr, - // rulebook_ptr + 2 * n + offsets[i], - // out_values_ptr, - // M, - // out_channels, - // false); - // } - } } // 4. scatter - if (true) { + if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); @@ -252,29 +220,29 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, if (out_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernel + phi::funcs::sparse::ScatterKernelV2 <<>>(out_features_ptr, - unique_value.data(), out_index.data(), + unique_value.data(), out->nnz(), - n, + kernel_size, out_channels, out_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel + phi::funcs::sparse::ScatterKernelV2 <<>>(out_features_ptr, - unique_value.data(), out_index.data(), + unique_value.data(), out->nnz(), - n, + kernel_size, out_channels, out_values_ptr); } From 82027714e4c7673e27b474f8da06678905c44edb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:50:44 +0000 Subject: [PATCH 38/70] opt scatter --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 16 ++++++++++++++++ .../phi/kernels/sparse/gpu/convolution_kernel.cu | 15 ++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 88b2b97c79404..f7f822af641a2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -473,6 +473,21 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, } } +template +__global__ void UpdateOutIndex(const int n, + const int kernel_size, + const IntT* out_indexs, + int* out_index_counts, + int* origin_out_indexs) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = out_indexs[i]; + // kernel_size at most + int j = atomicAdd(out_index_counts + index, 1); + // nnz * kernel_size + origin_out_indexs[index * 
kernel_size + j] = i; + } +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -631,6 +646,7 @@ int ProductRuleBook(const Context& dev_ctx, non_zero_num, out_rulebook_ptr); *rulebook = out_rulebook; + return rulebook_len; } else { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index e87fd71d511f6..39208681a373f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -174,6 +174,19 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, if (subm) { // set_zero(dev_ctx, out_values, static_cast(0.0f)); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + unique_value.ResizeAndAllocate( + {static_cast(out->nnz() * kernel_size)}); + out_index.ResizeAndAllocate({static_cast(n)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync(out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + UpdateOutIndex<<>>( + n, kernel_size, rulebook_ptr + 2 * n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -202,7 +215,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { + if (false) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); From 75df1e242af0cdc7c51a7bc2cf2d1044e060e4c1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:51:48 +0000 Subject: [PATCH 39/70] opt SparseMaskCopy --- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 95 ++++++++++++------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index e1f2a9149b4bd..2daff1ba43ae1 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { @@ -146,6 +145,36 @@ __global__ void SparseMaskCopyKernel(const IntT* x_indexs, } } +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? -1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); + } + } + } +} + template void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, @@ -217,52 +246,52 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, mask_indexs.numel(), sparse_dim, mask_indexs_ptr); -// 4. 
call thrust::lower_bound -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - x_indexs_ptr, - x_indexs_ptr + x_indexs.numel(), - mask_indexs_ptr, - mask_indexs_ptr + mask_indexs.numel(), - bound_out_ptr); - // 5. copy value to out + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + cudaMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); const int VecSize = VecBytes / sizeof(T); if (stride % VecSize == 0) { - SparseMaskCopyKernel + MaskCopy <<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), + dev_ctx.stream()>>>(mask_indexs_ptr, + table.data(), mask_indexs.numel(), stride, - out_ptr); - } else { - SparseMaskCopyKernel - <<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, x.non_zero_elements().data(), - mask_indexs.numel(), - stride, out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); } } From 2745b0e91178beab72047352e6c350901a03da47 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:52:39 +0000 Subject: [PATCH 40/70] coalesced is not performed by default --- paddle/phi/kernels/sparse/sparse_utils_kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 93abf70b24412..2f5bb189c0ffe 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -156,7 +156,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, SparseCooTensor* out) { SparseCooTensor before_coalesced( indices, values, phi::make_ddim(dense_shape.GetData())); - CoalescedKernel(dev_ctx, before_coalesced, out); + // CoalescedKernel(dev_ctx, before_coalesced, out); + *out = before_coalesced; } } // namespace sparse From ad9c2b610146e933a47394a9687568c59bb019c2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 07:37:45 +0000 Subject: [PATCH 41/70] opt rulebook --- .../phi/kernels/sparse/gpu/convolution.cu.h | 138 ++++-------------- .../sparse/gpu/convolution_grad_kernel.cu | 35 +++-- .../kernels/sparse/gpu/convolution_kernel.cu | 10 +- 3 files changed, 54 insertions(+), 129 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index f7f822af641a2..57bf0d51779bf 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -123,47 +123,6 @@ __global__ void UpdateOutIndex(const int* out_index_table, } } -template -inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, - const IntT* rulebook_ptr, - const int len, - DenseTensor* 
out_index, - DenseTensor* unique_key, - DenseTensor* unique_value) { - // phi::IndexKernel>( - // dev_ctx, out_index, kps::IdentityFunctor()); - // phi::IndexKernel>( - // dev_ctx, unique_value, kps::IdentityFunctor()); - // - // phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - // rulebook_ptr, - // sizeof(IntT) * len, - // gpuMemcpyDeviceToDevice, - // dev_ctx.stream()); - //// compared with thrust::sort_by_key, thrust::merge_by_key may achieved - // higher / performance, but thrust::merge_by_key limited by data size - // #ifdef PADDLE_WITH_HIP - // thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), - // #else - // thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), - // #endif - // unique_key->data(), - // unique_key->data() + len, - // out_index->data()); - // - // // 4. unique - // thrust::pair new_end = - // #ifdef PADDLE_WITH_HIP - // thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), - // #else - // thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), - // #endif - // unique_key->data(), - // unique_key->data() + len, - // unique_value->data()); - // return new_end.first; -} - /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list @@ -205,42 +164,6 @@ __global__ void UpdateIndexKernel(const T* unique_keys, } } -template -__global__ void UpdateOutIndexAndCounterAfterLowerBound( - const IntT* x_indexs, - const IntT* bound_out, - const int rulebook_len, - const int kernel_size, - const int64_t non_zero_num, - IntT* rulebook_ptr, - IntT* out_indexs, - int* counter_ptr) { - extern __shared__ int cache_count[]; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { - int j = bound_out[i]; - if (j >= 0 && j < non_zero_num && out_indexs[i] == x_indexs[j]) { - out_indexs[i] = j; - } else { - // mask this position will be remove - int kernel_index = rulebook_ptr[i]; - rulebook_ptr[i + rulebook_len] = -1; - rulebook_ptr[i + 2 * rulebook_len] = -1; - rulebook_ptr[i] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - /** * @brief product rulebook * for input_i in x_indices: @@ -307,9 +230,9 @@ __global__ void ProductRuleBookKernel(const T* x_indices, atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + // rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + i] = in_i; + rulebook[kernel_index * non_zero_num + offset + i] = out_index; ++kernel_index; } } @@ -381,13 +304,11 @@ __global__ void CopyRuleBook(const int* counters, } } int inner_index = i - offsets[kernel_index]; + // out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[len + i] = in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + inner_index]; - out_rulebook[2 * len + i] = - in_rulebook[2 * kernel_size * non_zero_num + - kernel_index * non_zero_num + inner_index]; } } @@ -464,10 +385,10 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, __syncthreads(); for (int i = 0; i < kernel_size; i++) { if (threadIdx.x < 
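// Rulebook layout after this change (as the edits above suggest): the kernel-index
// row is dropped and only two rows remain,
//   rulebook[0 * rulebook_len + i] -> input position of pair i
//   rulebook[1 * rulebook_len + i] -> output position of pair i
// The kernel element each pair belongs to is still recoverable because the pairs
// stay grouped per kernel element and the per-kernel counts/offsets
// (h_counter / h_offsets) are copied back to the host. This is also why the
// gather/scatter call sites below switch from rulebook_ptr + rulebook_len and
// rulebook_ptr + 2 * rulebook_len to rulebook_ptr and rulebook_ptr + rulebook_len.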
counter_buf[i]) { - rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; - rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = + // rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; + rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = rulebook_buf[i * blockDim.x + threadIdx.x]; - rulebook[i * non_zero_num + offset * 2 + counter_buf2[i] + threadIdx.x] = + rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = rulebook_buf[i * blockDim.x + kernel_size * blockDim.x + threadIdx.x]; } } @@ -536,8 +457,12 @@ int ProductRuleBook(const Context& dev_ctx, Dims4D d_strides(1, strides[2], strides[1], strides[0]); Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); + // phi::funcs::SetConstant set_zero; + // set_zero(dev_ctx, counter_per_kernel, 0); + phi::backends::gpu::GpuMemsetAsync(counter_ptr, + 0, + sizeof(int) * counter_per_kernel->numel(), + dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); @@ -547,7 +472,7 @@ int ProductRuleBook(const Context& dev_ctx, // convolution, // and then the intermediate output index is subtracted from the input index // to obain the rulebook. - const int rulebook_rows = 3; + const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); @@ -631,7 +556,8 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.Wait(); int rulebook_len = (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; - DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); + DenseTensor out_rulebook = + phi::Empty(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); cache_size = kernel_size * 2 * sizeof(int); @@ -650,7 +576,7 @@ int ProductRuleBook(const Context& dev_ctx, return rulebook_len; } else { - const int rulebook_rows = 3; + const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); @@ -681,20 +607,17 @@ int ProductRuleBook(const Context& dev_ctx, -1); phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - 1); IntT rulebook_len = 0; phi::backends::gpu::GpuMemcpyAsync( &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); + dev_ctx.Wait(); - rulebook_len /= 3; + rulebook_len /= 2; #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), @@ -708,21 +631,13 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], counter_ptr, kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], offsets_ptr, kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + 
gpuMemcpyDeviceToHost, dev_ctx.stream()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); @@ -742,12 +657,13 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + 2 * rulebook_len, + dev_ctx.stream()>>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, out_index_ptr, @@ -797,7 +713,7 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>(out_index_table_ptr, rulebook_len, kernel_size, - rulebook_ptr + 2 * rulebook_len, + rulebook_ptr + rulebook_len, out_index_ptr, unique_value_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 9533d456af0a6..7cabae44e7903 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -83,7 +83,9 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); + // set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync( + d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); @@ -91,8 +93,14 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::EmptyLike(dev_ctx, x.non_zero_indices()); DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); T* x_grad_values_ptr = x_grad_values.data(); - set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); - set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + // set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, + 0, + sizeof(T) * x_grad_values.numel(), + dev_ctx.stream()); + // set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync( + d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy(dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), @@ -138,7 +146,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, + rulebook_ptr, in_features_ptr, rulebook_len, in_channels); @@ -150,7 +158,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, + rulebook_ptr, in_features_ptr, rulebook_len, in_channels); @@ -164,7 +172,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, + rulebook_ptr + rulebook_len, out_grad_features_ptr, rulebook_len, out_channels); @@ -176,7 +184,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, + rulebook_ptr + rulebook_len, out_grad_features_ptr, rulebook_len, out_channels); @@ -231,13 +239,12 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::funcs::ScatterCUDAKernel<<>>( - 
d_x_features_ptr, - rulebook_ptr + rulebook_len, - x_grad_values_ptr, - rulebook_len, - in_channels, - false); + dev_ctx.stream()>>>(d_x_features_ptr, + rulebook_ptr, + x_grad_values_ptr, + rulebook_len, + in_channels, + false); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 39208681a373f..20911109036b3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,6 +139,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); + // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, + // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { @@ -149,7 +151,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + n, + rulebook_ptr, in_features_ptr, n, in_channels); @@ -161,7 +163,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + n, + rulebook_ptr, in_features_ptr, n, in_channels); @@ -186,7 +188,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block, 0, dev_ctx.stream()>>>( - n, kernel_size, rulebook_ptr + 2 * n, out_index_ptr, unique_value_ptr); + n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -224,7 +226,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block, 0, dev_ctx.stream()>>>(out_features_ptr, - rulebook_ptr + 2 * n, + rulebook_ptr + n, out_values_ptr, n, out_channels, From 214475bb64302594790ca38c3da39fde4898334e Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 08:24:47 +0000 Subject: [PATCH 42/70] remove a sync --- .../phi/kernels/sparse/gpu/convolution.cu.h | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 57bf0d51779bf..bc15b43a3200c 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -606,18 +606,22 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - 1); - IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - - dev_ctx.Wait(); - rulebook_len /= 2; + // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + // rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - + // 1); + // IntT rulebook_len = 0; + // phi::backends::gpu::GpuMemcpyAsync( + // &rulebook_len, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // gpuMemcpyDeviceToHost, + // dev_ctx.stream()); + + // dev_ctx.Wait(); + // rulebook_len /= 2; + // printf("rulebook_len = %d\n", rulebook_len); + // printf("distance = %d\n", last-rulebook_ptr); + IntT rulebook_len = (last - rulebook_ptr) / 2; #ifdef PADDLE_WITH_HIP 
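// Note on the sync removal just above: thrust::remove already returns the new
// logical end of the compacted rulebook (`last`) to the caller, so the number of
// surviving (in, out) pairs can be computed directly as (last - rulebook_ptr) / 2.
// The commented-out DistanceKernel launch, device-to-host copy and dev_ctx.Wait()
// only existed to fetch that same count, so they can be dropped along with one
// stream synchronization.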
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), From 13f0b93146205b4289747ff23f6895ab7678a8f2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 04:05:05 +0000 Subject: [PATCH 43/70] gatherV2 --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 2 + .../phi/kernels/sparse/gpu/convolution.cu.h | 59 ++++++++++++++-- .../sparse/gpu/convolution_grad_kernel.cu | 68 ++++++++++++++++--- .../kernels/sparse/gpu/convolution_kernel.cu | 39 +++++++---- 4 files changed, 138 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 39a2ad7a1db0a..de0a14c7dbf80 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -117,6 +117,8 @@ __global__ void ScatterKernelV2(const T* input, int len = out_index_counts[indices_i]; // max(end-start) = kernel_size StoreT sums = {static_cast(0)}; + phi::Load(out + indices_i * channels + channels_i * VecSize, + &sums); for (int j = 0; j < len; j++) { const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; LoadT vec_in; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index bc15b43a3200c..dc8600ce31d9d 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -68,6 +68,34 @@ __global__ void GatherKernel(const T* params, } } +template +__global__ void GatherKernelV2(const T* inputs, + const int* index_counts, + const int* origin_indexs, + const int non_zero_num, + const int kernel_size, + T* output, + const int channels) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + int len = index_counts[indices_i]; + LoadT in_vec; + phi::Load(inputs + indices_i * channels + channels_i * VecSize, + &in_vec); + for (int j = 0; j < len; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + } +} + template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -397,15 +425,34 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, template __global__ void UpdateOutIndex(const int n, const int kernel_size, - const IntT* out_indexs, - int* out_index_counts, - int* origin_out_indexs) { + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - IntT index = out_indexs[i]; + IntT index = indexs[i]; + // kernel_size at most + int j = atomicAdd(index_counts + index, 1); + // nnz * kernel_size + index_groups[index * kernel_size + j] = i; + } +} + +template +__global__ void UpdateOutIndexV2(const int n, + const int kernel_size, + const int half_kernel_offset, + const IntT* indexs, + int* index_counts, + int* index_groups) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = indexs[i]; // kernel_size at most - int j = atomicAdd(out_index_counts + index, 1); + int* counts_ptr = + i < half_kernel_offset ? index_counts : index_counts + nnz; + int j = atomicAdd(counts_ptr + index, 1); // nnz * kernel_size - origin_out_indexs[index * kernel_size + j] = i; + int group_offset = i < half_kernel_offset ? 
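// Rough idea of the V2 gather/scatter scheme in this patch: UpdateOutIndex records,
// for every distinct index, how many rulebook entries point at it (index_counts)
// and which entries those are (index_groups, with room for kernel_size entries per
// index). GatherKernelV2 then loads each feature row once and broadcasts it to all
// of its rulebook slots, and ScatterKernelV2 accumulates each output row by walking
// its group, so no atomics are needed on the feature values. The added phi::Load of
// `out` in ScatterKernelV2 makes the accumulation start from whatever is already
// stored in the output row rather than from an implicit zero.
// A minimal CPU sketch of the grouping step (illustrative only, hypothetical name,
// mirroring the kernel above):
//   void group_indexs(const int* indexs, int n, int kernel_size,
//                     int* counts, int* groups) {
//     for (int i = 0; i < n; ++i) {
//       int idx = indexs[i];
//       int j = counts[idx]++;              // atomicAdd on the GPU
//       groups[idx * kernel_size + j] = i;  // at most kernel_size entries per idx
//     }
//   }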
0 : kernel_size / 2; + index_groups[index * kernel_size + j + group_offset] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 7cabae44e7903..38c9661610766 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -233,18 +233,64 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); + // auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + // dev_ctx, rulebook_len * in_channels, 1); + // + // phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, + // rulebook_ptr, + // x_grad_values_ptr, + // rulebook_len, + // in_channels, + // false); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + DenseTensor unique_value = + phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor out_index = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, - rulebook_ptr, - x_grad_values_ptr, - rulebook_len, - in_channels, - false); + UpdateOutIndex<<>>( + rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + + if (in_channels % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + x_grad_values_ptr); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x_grad->nnz() * in_channels, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + x_grad_values_ptr); + } } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 20911109036b3..8c1e1e9e9e33f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,33 +139,47 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, - // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + DenseTensor index_groups = phi::Empty(dev_ctx, {x.nnz() * kernel_size}); + DenseTensor index_counts = phi::Empty(dev_ctx, {n}); + int* index_counts_ptr = index_counts.data(); + int* index_groups_ptr = index_groups.data(); + cudaMemsetAsync(index_counts_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + UpdateOutIndex<<>>( + n, kernel_size, rulebook_ptr, index_counts_ptr, index_groups_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, n * in_channels / VecSize, 1); - GatherKernel + dev_ctx, x.nnz() * in_channels / VecSize, 1); + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, + index_counts_ptr, + index_groups_ptr, + x.nnz(), + 
kernel_size, in_features_ptr, - n, in_channels); } else { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x.nnz() * in_channels, 1); + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, + index_counts_ptr, + index_groups_ptr, + x.nnz(), + kernel_size, in_features_ptr, - n, in_channels); } @@ -173,10 +187,9 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, auto blas = phi::funcs::GetBlas(dev_ctx); auto* out_values = out->mutable_non_zero_elements(); T* out_values_ptr = out_values->data(); + set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - // set_zero(dev_ctx, out_values, static_cast(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From c9929a2689339ed47c185c910d67e07d8bf9428a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 05:58:01 +0000 Subject: [PATCH 44/70] opt gather of backward --- .../phi/kernels/sparse/gpu/convolution.cu.h | 12 ++--- .../sparse/gpu/convolution_grad_kernel.cu | 54 ++++++++++--------- .../kernels/sparse/gpu/convolution_kernel.cu | 38 +++++-------- 3 files changed, 49 insertions(+), 55 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index dc8600ce31d9d..c5aea43e872ca 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -447,12 +447,12 @@ __global__ void UpdateOutIndexV2(const int n, CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = indexs[i]; // kernel_size at most - int* counts_ptr = - i < half_kernel_offset ? index_counts : index_counts + nnz; - int j = atomicAdd(counts_ptr + index, 1); - // nnz * kernel_size - int group_offset = i < half_kernel_offset ? 0 : kernel_size / 2; - index_groups[index * kernel_size + j + group_offset] = i; + /// int* counts_ptr = + /// i < half_kernel_offset ? index_counts : index_counts + nnz; + /// int j = atomicAdd(counts_ptr + index, 1); + ///// nnz * kernel_size + /// int group_offset = i < half_kernel_offset ? 
0 : kernel_size / 2; + /// index_groups[index * kernel_size + j + group_offset] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 38c9661610766..9571b260f0e3d 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -137,30 +137,52 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + DenseTensor unique_value = + phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor out_index = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + + UpdateOutIndex<<>>( + rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernel + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, + // rulebook_ptr, + out_index_ptr, + unique_value_ptr, rulebook_len, + kernel_size, + in_features_ptr, in_channels); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - GatherKernel + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, + out_index_ptr, + unique_value_ptr, rulebook_len, + kernel_size, + in_features_ptr, in_channels); } @@ -245,25 +267,9 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, // rulebook_len, // in_channels, // false); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - DenseTensor unique_value = - phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); - DenseTensor out_index = - phi::Empty(dev_ctx, {static_cast(rulebook_len)}); - int* out_index_ptr = out_index.data(); - int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - - UpdateOutIndex<<>>( - rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); if (in_channels % VecSize == 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); phi::funcs::sparse::ScatterKernelV2 <<nnz() * in_channels, 1); phi::funcs::sparse::ScatterKernelV2 <<(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); - DenseTensor index_groups = phi::Empty(dev_ctx, {x.nnz() * kernel_size}); - DenseTensor index_counts = phi::Empty(dev_ctx, {n}); - int* index_counts_ptr = index_counts.data(); - int* index_groups_ptr = index_groups.data(); - cudaMemsetAsync(index_counts_ptr, 0, sizeof(int) * n, dev_ctx.stream()); - UpdateOutIndex<<>>( - n, kernel_size, rulebook_ptr, index_counts_ptr, index_groups_ptr); + // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, + // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels / VecSize, 1); - GatherKernelV2 + 
dev_ctx, n * in_channels / VecSize, 1); + GatherKernel <<>>(x.non_zero_elements().data(), - index_counts_ptr, - index_groups_ptr, - x.nnz(), - kernel_size, + rulebook_ptr, in_features_ptr, + n, in_channels); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels, 1); - GatherKernelV2 + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel <<>>(x.non_zero_elements().data(), - index_counts_ptr, - index_groups_ptr, - x.nnz(), - kernel_size, + rulebook_ptr, in_features_ptr, + n, in_channels); } @@ -190,6 +176,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { + // set_zero(dev_ctx, out_values, static_cast(0.0f)); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From dab4609996c85e0e5005cbdff138d19ac502ae82 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 07:54:12 +0000 Subject: [PATCH 45/70] resolve conflict --- paddle/phi/kernels/sparse/cpu/convolution_kernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 7c37d90cd4cf9..ecf7073b41109 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" From f66d0c759bc196b2be68b54fcac397078616419d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 11:30:58 +0000 Subject: [PATCH 46/70] opt groups indexs --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 49 ++++++++++++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 56 ++++++++++++++++--- .../sparse/gpu/convolution_grad_kernel.cu | 53 ++++++++---------- .../kernels/sparse/gpu/convolution_kernel.cu | 1 - 4 files changed, 118 insertions(+), 41 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index de0a14c7dbf80..33157304424a3 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -134,6 +134,55 @@ __global__ void ScatterKernelV2(const T* input, } } +template +__global__ void ScatterKernelV3(const T* input, + const int* out_index_counts, + const int* origin_out_indexs, + const int non_zero_num, + const int kernel_size, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + + int len1 = out_index_counts[indices_i]; + StoreT sums = {static_cast(0)}; + phi::Load(out + indices_i * channels + channels_i * VecSize, + &sums); + for (int j = 0; j < len1; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma 
unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + + int len2 = out_index_counts[non_zero_num + indices_i]; + for (int j = 0; j < len2; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j + + kernel_size * non_zero_num]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 6a85b8622c609..a86f5f06083c2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -96,6 +96,41 @@ __global__ void GatherKernelV2(const T* inputs, } } +template +__global__ void GatherKernelV3(const T* inputs, + const int* index_counts, + const int* origin_indexs, + const int non_zero_num, + const int kernel_size, + T* output, + const int channels) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + int len1 = index_counts[indices_i]; + LoadT in_vec; + phi::Load(inputs + indices_i * channels + channels_i * VecSize, + &in_vec); + for (int j = 0; j < len1; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + int len2 = index_counts[non_zero_num + indices_i]; + for (int j = 0; j < len2; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j + + kernel_size * non_zero_num]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + } +} + template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -438,21 +473,24 @@ __global__ void UpdateOutIndex(const int n, } template -__global__ void UpdateOutIndexV2(const int n, +__global__ void UpdateOutIndexV2(const int rulebook_len, + const int non_zero_num, const int kernel_size, const int half_kernel_offset, const IntT* indexs, int* index_counts, int* index_groups) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { IntT index = indexs[i]; - // kernel_size at most - /// int* counts_ptr = - /// i < half_kernel_offset ? index_counts : index_counts + nnz; - /// int j = atomicAdd(counts_ptr + index, 1); - ///// nnz * kernel_size - /// int group_offset = i < half_kernel_offset ? 0 : kernel_size / 2; - /// index_groups[index * kernel_size + j + group_offset] = i; + int* counts_ptr = + i < half_kernel_offset ? index_counts : index_counts + non_zero_num; + int* groups_ptr = i < half_kernel_offset + ? 
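// Note on the V3 variants above (ScatterKernelV3 / GatherKernelV3 plus the
// two-bucket index grouping): every index now owns two counter/group slots, one
// for rulebook entries before the half-kernel offset and one for entries after it
// (counts[i] vs. counts[non_zero_num + i], with the second group stored at offset
// non_zero_num * kernel_size). Splitting the entries this way roughly halves the
// contention on the atomicAdd that assigns group slots, and the gather/scatter
// kernels simply walk both lists (len1 and len2) when broadcasting or accumulating.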
index_groups + : index_groups + non_zero_num * kernel_size; + // conflict kernel_size times at most + int j = atomicAdd(counts_ptr + index, 1); + // nnz * kernel_size + groups_ptr[index * kernel_size + j] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 745ee0e83bb91..72df861cccbef 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -139,48 +139,52 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - DenseTensor unique_value = - phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor unique_value = phi::Empty( + dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); DenseTensor out_index = - phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); cudaMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); - UpdateOutIndex<<>>( - rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + UpdateOutIndexV2<<>>(rulebook_len, + x.nnz(), + kernel_size, + offsets[kernel_size / 2], + rulebook_ptr, + out_index_ptr, + unique_value_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernelV2 + dev_ctx, x.nnz() * in_channels / VecSize, 1); + GatherKernelV3 <<>>(x.non_zero_elements().data(), - // rulebook_ptr, out_index_ptr, unique_value_ptr, - rulebook_len, + x.nnz(), kernel_size, in_features_ptr, in_channels); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernelV2 + dev_ctx, x.nnz() * in_channels, 1); + GatherKernelV3 <<>>(x.non_zero_elements().data(), out_index_ptr, unique_value_ptr, - rulebook_len, + x.nnz(), kernel_size, in_features_ptr, in_channels); @@ -255,23 +259,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - // auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - // dev_ctx, rulebook_len * in_channels, 1); - // - // phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, - // rulebook_ptr, - // x_grad_values_ptr, - // rulebook_len, - // in_channels, - // false); - if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 + phi::funcs::sparse::ScatterKernelV3 <<nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV2 + phi::funcs::sparse::ScatterKernelV3 <<(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From 44ad03e1e4daa15a20bb98e4b123b64501ff75c8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Tue, 28 Jun 2022 23:18:53 +0800 Subject: [PATCH 47/70] refine code --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 127 +++++++++++--------- 1 file changed, 73 insertions(+), 54 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index a26bba041912b..1936fccc63e4a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,6 +138,62 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } +template +__device__ __forceinline__ void merge_block_vertical( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.y == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +__device__ __forceinline__ void merge_block_horizonal( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.x == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + template static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, @@ -180,20 +236,8 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += 
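// The helpers merge_block_vertical / merge_block_horizonal factor out the
// shared-memory tree reduction that was previously written inline: at each step the
// active threads along one block dimension publish their partial (sum, sum of
// squares) to shared memory, the stride `offset` is halved, and the surviving half
// adds in its partner's values. The threads with threadIdx.y == 0 (vertical) or
// threadIdx.x == 0 (horizonal) end up holding the block-wide sums used for the
// mean/variance computation that follows.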
smem_square_sum[pair_tid]; - } - } + merge_block_vertical( + x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); if (gridDim.y > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -228,19 +272,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + merge_block_vertical(x_sum, + x_square_sum, + smem_sum, + smem_square_sum, + &x_sum, + &x_square_sum); // final compute if (threadIdx.y == 0) { @@ -363,19 +400,8 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { - int pair_tid = tid + offset; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + merge_block_horizonal( + x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); if (gridDim.x > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -409,20 +435,13 @@ static __global__ void BNForwardTraining2DCompStat( x_square_sum += staging_square_sum[i + x * C]; } - // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.x < offset && threadIdx.x + offset < blockDim.y) { - int pair_tid = tid + offset; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + smem_sum, + smem_square_sum, + &x_sum, + &x_square_sum); // final compute if (threadIdx.x == 0) { From db9792c1109dfef99892807bd7d8972138fc2ce3 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 1 Jul 2022 13:38:35 +0000 Subject: [PATCH 48/70] replace sort with remove_copy --- .../phi/kernels/sparse/gpu/convolution.cu.h | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a86f5f06083c2..001b38847e5ff 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -137,11 +137,9 @@ __global__ void UniqueKernel(const IntT* in_indexs, int* out_index_table, int* out_indexs, int* nnz) { - extern __shared__ int cache[]; - __shared__ int count, start; + __shared__ int count; if (threadIdx.x == 0) { count = 0; - start = 0; } __syncthreads(); @@ -149,7 +147,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, if (i < rulebook_len) { // atomicOr only support int int index = static_cast(in_indexs[i]); - int change_index = index == 0 ? 1 : index; + int change_index = index == 0 ? 
-1 : index; int flag = atomicOr(out_index_table + index, change_index); if (flag == 0) { int j = atomicAdd(&count, 1); @@ -159,11 +157,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, __syncthreads(); if (threadIdx.x == 0) { - start = atomicAdd(nnz, count); - } - __syncthreads(); - for (int i = threadIdx.x; i < count; i += blockDim.x) { - out_indexs[start + i] = cache[i]; + atomicAdd(nnz, count); } } @@ -330,6 +324,7 @@ __global__ void GetOutIndexTable(const int* indexs, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); + index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -748,10 +743,9 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, @@ -776,9 +770,14 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), - out_index_ptr, - out_index_ptr + out_nnz); + // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_ptr, + // out_index_ptr + out_nnz); + thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), + out_index_table_ptr, + out_index_table_ptr + table_size, + out_index_ptr, + 0); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<< Date: Fri, 1 Jul 2022 13:51:39 +0000 Subject: [PATCH 49/70] fix cache --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 001b38847e5ff..e5285bb7c33d5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -151,7 +151,6 @@ __global__ void UniqueKernel(const IntT* in_indexs, int flag = atomicOr(out_index_table + index, change_index); if (flag == 0) { int j = atomicAdd(&count, 1); - cache[j] = index; } } __syncthreads(); From c2957762ebc09f4db69d588c2f75c1879b653ae4 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sun, 3 Jul 2022 03:03:30 +0000 Subject: [PATCH 50/70] unorder the out index of Conv3D --- .../phi/kernels/sparse/gpu/convolution.cu.h | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 001b38847e5ff..3f975c80d54d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -137,9 +137,11 @@ __global__ void UniqueKernel(const IntT* in_indexs, int* out_index_table, int* out_indexs, int* nnz) { - __shared__ int count; + extern __shared__ int cache[]; + __shared__ int count, start; if (threadIdx.x == 0) { count = 0; + start = 0; } __syncthreads(); @@ -157,7 +159,11 @@ __global__ void UniqueKernel(const IntT* in_indexs, __syncthreads(); if (threadIdx.x == 0) { - atomicAdd(nnz, count); + start = atomicAdd(nnz, count); + } + __syncthreads(); + for (int i = threadIdx.x; i < count; i += blockDim.x) { + out_indexs[start + i] = cache[i]; } } @@ -317,14 +323,14 @@ __global__ void GetOutIndexTable(const IntT* indices, } template -__global__ void GetOutIndexTable(const int* indexs, +__global__ void 
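// How the de-duplication of output indexes works here (a rough reading of
// UniqueKernel as restored in this patch): out_index_table is zero-filled, and each
// thread marks its flattened output index with atomicOr using a non-zero value
// (index 0 is encoded as -1 so the mark stays non-zero). Only the thread that sees
// the old value 0 owns that index; it appends it to the block-shared cache, the
// block reserves a contiguous range in out_indexs with a single atomicAdd on nnz,
// and the cache is flushed into that range. The resulting unique index list is in
// no particular order, which is why the unit test later coalesces the conv output
// before comparing values.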
GetOutIndexTable(int* indexs, const int non_zero_num, const Dims4D out_dims, int* out_index_table, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); - index = index == -1 ? 0 : index; + // index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -334,6 +340,7 @@ __global__ void GetOutIndexTable(const int* indexs, out_indices[i + non_zero_num] = z; out_indices[i + non_zero_num * 2] = y; out_indices[i + non_zero_num * 3] = x; + indexs[i] = 0; } } @@ -743,9 +750,10 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, @@ -759,6 +767,14 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()); dev_ctx.Wait(); + // thrust::pair min_max = + // thrust::minmax_element(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_ptr, out_index_ptr + out_nnz); int start = 0, end = 0; + // cudaMemcpyAsync(&start, min_max.first, sizeof(int), + // cudaMemcpyDeviceToHost, dev_ctx.stream()); cudaMemcpyAsync(&end, + // min_max.second, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); + // dev_ctx.Wait(); + const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); @@ -773,11 +789,12 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), // out_index_ptr, // out_index_ptr + out_nnz); - thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), - out_index_table_ptr, - out_index_table_ptr + table_size, - out_index_ptr, - 0); + // printf("start = %d, end=%d, table_size=%d, nnz=%d\n", start, end, + // table_size); thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_table_ptr + start, + // out_index_table_ptr + end, + // out_index_ptr, + // 0); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); From e181b1c085ea6d5f75dfd4bf5ccee6be6f52e86d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 4 Jul 2022 06:06:10 +0000 Subject: [PATCH 51/70] add coalesced --- paddle/phi/api/yaml/sparse_api.yaml | 7 +++++++ .../phi/kernels/sparse/gpu/coalesced_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 15 +++++++++------ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 17 +++++++++-------- .../tests/unittests/test_sparse_conv_op.py | 1 + python/paddle/incubate/sparse/__init__.py | 3 +++ 6 files changed, 30 insertions(+), 15 deletions(-) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index f3379f8c956db..3987208c2ecc7 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -131,6 +131,13 @@ layout : x backward : values_grad +- api: coalesced + args : (Tensor x) + output : Tensor(out) + kernel : + func: coalesced{sparse_coo -> sparse_coo} + layout : x + - api: full_like args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) output : Tensor(out) diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index c5f0c332c7bbc..bf59c10bedd96 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ 
b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -192,7 +192,7 @@ void CoalescedKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesced, GPU, ALL_LAYOUT, phi::sparse::CoalescedKernel, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 3f975c80d54d0..c923e6df5fc9f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -318,7 +318,7 @@ __global__ void GetOutIndexTable(const IntT* indices, IntT in_y = indices[i + 2 * non_zero_num]; IntT in_x = indices[i + 3 * non_zero_num]; IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); - out_index_table[index] = i; + out_index_table[index] = i == 0 ? -1 : i; } } @@ -429,7 +429,8 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, out_index = phi::funcs::sparse::PointToIndex( batch, out_x, out_y, out_z, out_dims); int real_out_index = out_index_table[out_index]; - if (real_out_index != -1) { + if (real_out_index != 0) { + real_out_index = real_out_index == -1 ? 0 : real_out_index; in_i = i; int buf_i = atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; @@ -581,10 +582,12 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); IntT* out_index_table_ptr = out_index_table.data(); - thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), - out_index_table_ptr, - out_index_table_ptr + out_index_table.numel(), - -1); + // thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_table_ptr, + // out_index_table_ptr + out_index_table.numel(), + // -1); + cudaMemsetAsync( + out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 36f79dd346680..0e399a7b0e81f 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -94,14 +94,15 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel<<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); out->SetMember(out_indices, out_values, dims, true); } diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index afd9c33421660..e43db8bd9150b 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -54,6 +54,7 @@ def test_conv3d(self): key='conv3d', data_format="NDHWC") out.backward(out) + out = paddle.incubate.sparse.coalesced(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index f696434118745..6c9678873abe5 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -28,6 +28,8 @@ from .math import multiply from .math import subtract +from .coalesced import coalesced + from . 
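# Note on the `coalesced` API wired up above: since the SparseCooTensor constructor
# no longer coalesces by default (see the earlier sparse_utils_kernel.h change) and
# Conv3D now emits its output indices in an arbitrary order, sorting/merging of
# coordinates has to be requested explicitly. Typical usage, as in the updated unit
# test (a hedged example, not additional API surface):
#     out = paddle.incubate.sparse.coalesced(out)  # sort indices, merge duplicates
#     assert np.array_equal(correct_out_values, out.values().numpy())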
import nn __all__ = [ @@ -43,4 +45,5 @@ 'subtract', 'multiply', 'divide', + 'coalesced', ] From e2bf43a6e9ead0886f57947ce1eb431f7dc86f22 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 4 Jul 2022 07:44:16 +0000 Subject: [PATCH 52/70] add coalesced.py --- python/paddle/incubate/sparse/coalesced.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 python/paddle/incubate/sparse/coalesced.py diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py new file mode 100644 index 0000000000000..dcd2f8ca28f3a --- /dev/null +++ b/python/paddle/incubate/sparse/coalesced.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import _C_ops +from paddle.fluid.framework import core, dygraph_only + +__all__ = [ + 'coalesced', +] + + +@dygraph_only +def coalesced(x): + return _C_ops.final_state_sparse_coalesced(x) From 0aa457fc6e8b0ffda8b0bb28330baf02b343c8eb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 06:35:03 +0000 Subject: [PATCH 53/70] coalesced before compare result --- paddle/phi/kernels/sparse/coalesced_kernel.h | 7 + .../kernels/sparse/gpu/coalesced_kernel.cu | 13 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 122 ++++++------------ .../sparse/gpu/convolution_grad_kernel.cu | 20 +-- .../kernels/sparse/gpu/convolution_kernel.cu | 78 +++++------ .../kernels/test_sparse_conv3d_dev_api.cc | 6 +- 6 files changed, 89 insertions(+), 157 deletions(-) diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesced_kernel.h index 0755579a57ade..d2f5f8f3150af 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/sparse/coalesced_kernel.h @@ -26,5 +26,12 @@ void CoalescedKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); +template +SparseCooTensor Coalesced(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + CoalescedKernel(dev_ctx, x, &coo); + return coo; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index bf59c10bedd96..ac147ccd0abb6 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -55,11 +55,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 1. 
flatten indices @@ -117,11 +113,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&out_nnz, out_indices.data(), sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); dev_ctx.Wait(); @@ -188,7 +180,6 @@ void CoalescedKernel(const Context& dev_ctx, CoalescedGPUKernel(dev_ctx, x, out); })); } - } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index c923e6df5fc9f..4c0929a91cefd 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -37,17 +37,9 @@ namespace sparse { using Dims4D = phi::funcs::sparse::Dims4D; -// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace -// this kernel with phi::GatherCUDAKernel; -// Vectorization can be used to improve read and write bandwidth -/** - * brief: gather data from params according to indices - * params: the inputs - * indices: the indices you want to gather - * output: the outputs - * index_size: the size of indices - * slice_size: slice size corresponding to each index, here is the channel size - **/ +// Vectorize load and store global memory +// In the scene of 3D point cloud, the slice_size 4,8,16,32,64 are commonly +// used. template __global__ void GatherKernel(const T* params, const IndexT* indices, @@ -68,6 +60,7 @@ __global__ void GatherKernel(const T* params, } } +// the index_counts records the number of times the same index will be gather template __global__ void GatherKernelV2(const T* inputs, const int* index_counts, @@ -96,6 +89,7 @@ __global__ void GatherKernelV2(const T* inputs, } } +// double sparse, seed GroupIndexs template __global__ void GatherKernelV3(const T* inputs, const int* index_counts, @@ -131,6 +125,7 @@ __global__ void GatherKernelV3(const T* inputs, } } +// unique the out indexs in rulebook template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -168,12 +163,12 @@ __global__ void UniqueKernel(const IntT* in_indexs, } template -__global__ void UpdateOutIndex(const int* out_index_table, - const int n, - const int kernel_size, - IntT* out_indexs, - int* out_index_counts, - int* origin_out_indexs) { +__global__ void GroupIndexs(const int* out_index_table, + const int n, + const int kernel_size, + IntT* out_indexs, + int* out_index_counts, + int* out_index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = out_indexs[i]; int real_index = out_index_table[index]; @@ -182,7 +177,7 @@ __global__ void UpdateOutIndex(const int* out_index_table, // kernel_size at most int j = atomicAdd(out_index_counts + real_index, 1); // nnz * kernel_size - origin_out_indexs[real_index * kernel_size + j] = i; + out_index_groups[real_index * kernel_size + j] = i; } } @@ -461,11 +456,11 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, } template -__global__ void UpdateOutIndex(const int n, - const int kernel_size, - const IntT* indexs, - int* index_counts, - int* index_groups) { +__global__ void GroupIndexs(const int n, + const int kernel_size, + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = indexs[i]; // kernel_size at most @@ -475,14 +470,15 @@ __global__ void UpdateOutIndex(const int n, } } +// double space to reduce atomicAdd conflict template -__global__ void UpdateOutIndexV2(const int rulebook_len, - const 
int non_zero_num, - const int kernel_size, - const int half_kernel_offset, - const IntT* indexs, - int* index_counts, - int* index_groups) { +__global__ void GroupIndexsV2(const int rulebook_len, + const int non_zero_num, + const int kernel_size, + const int half_kernel_offset, + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { IntT index = indexs[i]; int* counts_ptr = @@ -545,8 +541,6 @@ int ProductRuleBook(const Context& dev_ctx, Dims4D d_strides(1, strides[2], strides[1], strides[0]); Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); // 1. product rule book - // phi::funcs::SetConstant set_zero; - // set_zero(dev_ctx, counter_per_kernel, 0); phi::backends::gpu::GpuMemsetAsync(counter_ptr, 0, sizeof(int) * counter_per_kernel->numel(), @@ -555,11 +549,6 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); if (subm) { - // At present, hashtable is not used to map the input and output indexes. - // At present, the intermediate output index is generated by normal - // convolution, - // and then the intermediate output index is subtracted from the input index - // to obain the rulebook. const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( @@ -582,11 +571,7 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); IntT* out_index_table_ptr = out_index_table.data(); - // thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_table_ptr, - // out_index_table_ptr + out_index_table.numel(), - // -1); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); auto config = @@ -696,21 +681,6 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - // rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - - // 1); - // IntT rulebook_len = 0; - // phi::backends::gpu::GpuMemcpyAsync( - // &rulebook_len, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - - // dev_ctx.Wait(); - // rulebook_len /= 2; - // printf("rulebook_len = %d\n", rulebook_len); - // printf("distance = %d\n", last-rulebook_ptr); IntT rulebook_len = (last - rulebook_ptr) / 2; #ifdef PADDLE_WITH_HIP @@ -770,14 +740,6 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()); dev_ctx.Wait(); - // thrust::pair min_max = - // thrust::minmax_element(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_ptr, out_index_ptr + out_nnz); int start = 0, end = 0; - // cudaMemcpyAsync(&start, min_max.first, sizeof(int), - // cudaMemcpyDeviceToHost, dev_ctx.stream()); cudaMemcpyAsync(&end, - // min_max.second, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); - // dev_ctx.Wait(); - const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); @@ -789,16 +751,6 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_ptr, - // out_index_ptr + out_nnz); - // printf("start = %d, end=%d, table_size=%d, nnz=%d\n", start, end, - // table_size); thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_table_ptr + start, - // 
out_index_table_ptr + end, - // out_index_ptr, - // 0); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); // return rulebook_len; - UpdateOutIndex<<>>(out_index_table_ptr, - rulebook_len, - kernel_size, - rulebook_ptr + rulebook_len, - out_index_ptr, - unique_value_ptr); + GroupIndexs<<>>(out_index_table_ptr, + rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); return rulebook_len; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 72df861cccbef..06aed45b488e4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -148,16 +148,16 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, cudaMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); - UpdateOutIndexV2<<>>(rulebook_len, - x.nnz(), - kernel_size, - offsets[kernel_size / 2], - rulebook_ptr, - out_index_ptr, - unique_value_ptr); + GroupIndexsV2<<>>(rulebook_len, + x.nnz(), + kernel_size, + offsets[kernel_size / 2], + rulebook_ptr, + out_index_ptr, + unique_value_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 8e09c4c24e7d5..3a7415870e103 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,8 +139,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, - // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { @@ -176,18 +174,18 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - // set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); out_index.ResizeAndAllocate({static_cast(n)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync(out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); - UpdateOutIndex<<>>( + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + GroupIndexs<<>>( n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); @@ -217,50 +215,34 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - if (false) { - set_zero(dev_ctx, out_values, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); - phi::funcs::ScatterCUDAKernel - <<nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(out_features_ptr, - rulebook_ptr + n, - out_values_ptr, - n, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, out_channels, - false); + out_values_ptr); } else { - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - out_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - out_values_ptr); - } + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + out_values_ptr); } } /** diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 5f3c290a2414e..6f03f3e1ec0b6 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -207,6 +208,7 @@ void TestConv3dBase(const std::vector& indices, 1, subm, "Conv3d_0"); + SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -218,7 +220,7 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -232,7 +234,7 @@ void TestConv3dBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); From 749dccefd077f86254d09eaba020e20a7a018ae9 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 11:54:05 +0000 Subject: [PATCH 54/70] the key of conv3d is not required --- .../final_state_generator/eager_gen.py | 3 + paddle/phi/api/yaml/sparse_api.yaml | 5 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +- .../kernels/sparse/convolution_grad_kernel.h | 6 ++ .../phi/kernels/sparse/convolution_kernel.h | 12 ++- .../sparse/cpu/convolution_grad_kernel.cc | 48 +++++++--- .../kernels/sparse/cpu/convolution_kernel.cc | 79 ++++++++++------ .../phi/kernels/sparse/gpu/convolution.cu.h | 15 +-- .../sparse/gpu/convolution_grad_kernel.cu | 39 ++++++-- .../kernels/sparse/gpu/convolution_kernel.cu | 91 ++++++++++++------- 
.../kernels/test_sparse_conv3d_dev_api.cc | 18 ++-- .../tests/unittests/test_sparse_conv_op.py | 4 +- .../incubate/sparse/nn/functional/conv.py | 9 +- .../paddle/incubate/sparse/nn/layer/conv.py | 3 +- 14 files changed, 221 insertions(+), 117 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d406f00b25039..a595bf5c613c6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -545,6 +545,9 @@ def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map backward_grad_inputs_map = self.backward_grad_inputs_map backward_attrs_list = self.backward_attrs_list + print(backward_forward_inputs_map) + print(backward_grad_inputs_map) + print(backward_attrs_list) # Check Order: TensorWrappers, GradTensors, Attributes max_fwd_input_position = -1 diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 3987208c2ecc7..cc583fce1f8a1 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -9,10 +9,11 @@ - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) - output : Tensor(out) + output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo} + func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense} layout : x + intermediate: rulebook, counter backward : conv3d_grad - api : coo_to_dense diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index c96443bc8241b..04cdfccce636e 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -7,11 +7,11 @@ add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor) - args : (Tensor x, Tensor kernel, Tensor out, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo, dense} + func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 52be7ffd02903..54d09babb2cf9 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,6 +27,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, 
const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -43,6 +45,8 @@ std::tuple Conv3dGrad( const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -58,6 +62,8 @@ std::tuple Conv3dGrad( x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 6acc3241385a3..62559d4e0ff1e 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -32,7 +32,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out); + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); template SparseCooTensor Conv3d(const Context& dev_ctx, @@ -43,7 +45,9 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - const std::string& key) { + const std::string& key, + DenseTensor* rulebook, + DenseTensor* counter) { SparseCooTensor coo; Conv3dKernel(dev_ctx, x, @@ -54,7 +58,9 @@ SparseCooTensor Conv3d(const Context& dev_ctx, groups, subm, key, - &coo); + &coo, + rulebook, + counter); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 071d586dc7d56..5c6c2539c0a74 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -35,6 +35,8 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -48,12 +50,26 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - // const DenseTensor& rulebook = out.rulebook(); - const auto* table = out.table(key); - const DenseTensor& rulebook = table->first; - const IntT* rulebook_ptr = rulebook.data(); - const int rulebook_len = rulebook.dims()[1]; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + const int* counter_ptr = nullptr; + bool cache_in_table = false; + if (!key.empty()) { + const auto* table = out.table(key); + if (table != nullptr) { + cache_in_table = true; + const DenseTensor& tmp_rulebook = table->first; + rulebook_ptr = tmp_rulebook.data(); + rulebook_len = tmp_rulebook.dims()[1]; + counter_ptr = table->second.data(); + } + } + if (!cache_in_table) { + rulebook_ptr = rulebook.data(); + rulebook_len = rulebook.dims()[1]; + counter_ptr = counter.data(); + } DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); @@ -90,16 +106,14 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, &x_grad_indices); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0); - for (int i = 0; i < rulebook_len; i++) { - counter[rulebook_ptr[i]] += 1; - } - IntT offset = 0, max_count = 0; + std::vector offsets(kernel_size + 1); + IntT offset = 0; + int max_count = 0; for (int i = 0; 
i < kernel_size; i++) { offsets[i] = offset; - offset += counter[i]; + offset += counter_ptr[i]; if (i < half_kernel_size) { - max_count = std::max(max_count, counter[i]); + max_count = std::max(max_count, counter_ptr[i]); } } offsets[kernel_size] = offset; @@ -133,11 +147,11 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0 || (subm && i == half_kernel_size)) { + if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { continue; } - const int M = counter[i]; + const int M = counter_ptr[i]; const int K = in_channels; const int N = out_channels; T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; @@ -175,7 +189,7 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, // 4. scatter Scatter(d_x_features_ptr, - rulebook.data() + rulebook_len, + rulebook_ptr + rulebook_len, rulebook_len, in_channels, x_grad_values_ptr); @@ -186,6 +200,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -201,6 +217,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index ecf7073b41109..f5f7497df96fb 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -37,7 +37,9 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -71,27 +73,34 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; - PADDLE_ENFORCE(!key.empty(), - phi::errors::Fatal("the key of sparse conv must be not null")); int n = 0; - const auto* table = x.table(key); - if (subm && table != nullptr) { - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - memcpy(counter_per_kernel.data(), - table->second.data(), - kernel_size * sizeof(int)); - } else { - DenseTensor rulebook; + bool need_product_rulebook = true; + if (subm && !key.empty()) { + const auto* table = x.table(key); + if (table != nullptr) { + need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + rulebook_ptr = rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); + n = rulebook.dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = + phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + 
memcpy(counter_per_kernel.data(), + table->second.data(), + kernel_size * sizeof(int)); + } + } + if (need_product_rulebook) { + DenseTensor tmp_rulebook; ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -100,15 +109,25 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, subm_strides, out_dims, subm, - &rulebook, + &tmp_rulebook, &counter_per_kernel); UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, out_channels, out_dims, &rulebook, out); - n = rulebook.dims()[1]; + dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out); + n = tmp_rulebook.dims()[1]; + rulebook_ptr = tmp_rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); - out->SetTable(key, std::make_pair(rulebook, counter_per_kernel)); - rulebook_ptr = rulebook.data(); + if (!key.empty()) { + out->SetTable(key, std::make_pair(tmp_rulebook, counter_per_kernel)); + } else { + *rulebook = tmp_rulebook; + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, + counter_per_kernel.data(), + counter_per_kernel.size() * sizeof(int)); + } } // int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -183,7 +202,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, @@ -195,7 +216,9 @@ void Conv3dKernel(const Context& dev_ctx, groups, subm, key, - out); + out, + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 4c0929a91cefd..139259f48c8b3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -718,9 +718,10 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); int* out_index_table_ptr = out_index_table.data(); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); - cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync( + unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; @@ -733,11 +734,11 @@ int ProductRuleBook(const Context& dev_ctx, out_index_ptr, unique_key_ptr); int out_nnz = 0; - cudaMemcpyAsync(&out_nnz, - unique_key_ptr, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&out_nnz, + unique_key_ptr, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); const int64_t sparse_dim = 4; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 06aed45b488e4..741720b20b9ff 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -44,6 +44,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -58,11 +60,25 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const 
int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - const auto* table = out.table(key); - const DenseTensor& rulebook = table->first; - const IntT* rulebook_ptr = rulebook.data(); - - const int rulebook_len = rulebook.dims()[1]; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + const int* counter_ptr = nullptr; + bool cache_in_table = false; + if (!key.empty()) { + const auto* table = out.table(key); + if (table != nullptr) { + cache_in_table = true; + const DenseTensor& tmp_rulebook = table->first; + rulebook_ptr = tmp_rulebook.data(); + rulebook_len = tmp_rulebook.dims()[1]; + counter_ptr = table->second.data(); + } + } + if (!cache_in_table) { + rulebook_ptr = rulebook.data(); + rulebook_len = rulebook.dims()[1]; + counter_ptr = counter.data(); + } DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); @@ -109,14 +125,13 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); std::vector offsets(kernel_size + 1); - const auto& counter = table->second; int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; - offset += counter[i]; + offset += counter_ptr[i]; if (i < half_kernel_size) { - max_count = std::max(max_count, counter[i]); + max_count = std::max(max_count, counter_ptr[i]); } } offsets[kernel_size] = offset; @@ -218,11 +233,11 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0 || (subm && i == half_kernel_size)) { + if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { continue; } - const int M = counter[i]; + const int M = counter_ptr[i]; const int K = in_channels; const int N = out_channels; T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; @@ -295,6 +310,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -310,6 +327,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 3a7415870e103..6c071d963ebd2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -23,6 +23,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "glog/logging.h" + namespace phi { namespace sparse { @@ -36,7 +38,9 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -76,35 +80,41 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; int n = 0; const IntT* rulebook_ptr = nullptr; - PADDLE_ENFORCE_EQ( - key.empty(), - false, - phi::errors::Fatal("the key of sparse conv must be not null")); - const auto* table = x.table(key); - if (subm && table != nullptr) { - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); + bool need_product_rulebook = true; + if (subm && !key.empty()) { + const auto* table = x.table(key); + if (table != nullptr) { + need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + rulebook_ptr = rulebook.data(); + memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); + out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; + n = rulebook.dims()[1]; - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - IntT offset = 0; - for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += h_counter[i]; + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = + phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + IntT offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += h_counter[i]; + } + offsets[kernel_size] = offset; } - offsets[kernel_size] = offset; - } else { - DenseTensor rulebook; + } + if (need_product_rulebook) { + DenseTensor tmp_rulebook; n = ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -113,7 +123,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, subm_strides, out_dims, subm, - &rulebook, + &tmp_rulebook, &counter_per_kernel, &offsets_per_kernel, &out_index, @@ -121,9 +131,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out, &h_counter, &offsets); + rulebook_ptr = tmp_rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); - out->SetTable(key, std::make_pair(rulebook, h_counter)); - rulebook_ptr = rulebook.data(); + if (!key.empty()) { + out->SetTable(key, std::make_pair(tmp_rulebook, h_counter)); + } else { + *rulebook = tmp_rulebook; + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); + } } // 2. 
gather @@ -245,10 +263,13 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out_values_ptr); } } + /** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) + * x: the input SparseCooTensor, shape is (N, D, H, W, C) + * kernel: the weight data, shape is (D, H, W, C, OC) + * out: the output SparseCooTensor, shape is (N, D, H, W, OC) + * rulebook: return rulebook if key is not vailed else return nullptr + * counter: return counter if key is not vailed else return nullptr **/ template void Conv3dKernel(const Context& dev_ctx, @@ -260,7 +281,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, @@ -272,7 +295,9 @@ void Conv3dKernel(const Context& dev_ctx, groups, subm, key, - out); + out, + rulebook, + counter); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 6f03f3e1ec0b6..df4fec61a9a3d 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -112,8 +112,7 @@ void TestConv3dBase(const std::vector& indices, }; if (!std::is_same::value) { - // DenseTensor rulebook = phi::Empty( - // dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + DenseTensor rulebook, counter; SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, x_tensor, kernel_tensor, @@ -122,7 +121,9 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d_0", + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -143,6 +144,8 @@ void TestConv3dBase(const std::vector& indices, x_tensor, kernel_tensor, out, + rulebook, + counter, out, paddings, dilations, @@ -197,8 +200,7 @@ void TestConv3dBase(const std::vector& indices, phi::Copy( dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); - // DenseTensor d_rulebook = phi::Empty( - // dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + DenseTensor d_rulebook, d_counter; SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, @@ -207,7 +209,9 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d_0", + &d_rulebook, + &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -246,6 +250,8 @@ void TestConv3dBase(const std::vector& indices, d_x_tensor, d_kernel_tensor, d_out, + d_rulebook, + d_counter, d_out, paddings, dilations, diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e43db8bd9150b..ede33e4167472 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -51,7 +51,6 @@ def test_conv3d(self): padding=paddings, dilation=dilations, groups=1, - key='conv3d', data_format="NDHWC") out.backward(out) out = paddle.incubate.sparse.coalesced(out) @@ -86,7 +85,7 @@ def test_Conv3D(self): indices, values, dense_shape, False) sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( - 1, 1, (1, 3, 3), data_format='NDHWC', key='conv3d') + 1, 1, (1, 3, 3), data_format='NDHWC') sparse_out = 
sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): @@ -131,7 +130,6 @@ def test_Conv3D_bias(self): sp_conv3d = paddle.incubate.sparse.nn.Conv3D(3, 2, 3, - key='conv3d', data_format='NDHWC') sp_conv3d.weight.set_value( paddle.to_tensor(conv3d.weight.numpy().transpose(2, 3, 4, 1, diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index 62800dd01c65b..2dda83b2c1659 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -63,9 +63,9 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - pre_bias = _C_ops.final_state_sparse_conv3d( - x, weight, padding, dilation, stride, groups, subm, - key if key is not None else name) + pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, + stride, groups, subm, + key if key is not None else "") if bias is not None: values = pre_bias.values() add_bias = elementwise_add(values, bias, axis=1) @@ -84,7 +84,6 @@ def conv3d(x, padding=0, dilation=1, groups=1, - key=None, data_format="NDHWC", name=None): r""" @@ -189,7 +188,7 @@ def conv3d(x, # (1, 1, 1, 2, 1) """ return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, - key, data_format, name) + None, data_format, name) def subm_conv3d(x, diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py index b583fb6c12627..c7fe1f7b4033e 100644 --- a/python/paddle/incubate/sparse/nn/layer/conv.py +++ b/python/paddle/incubate/sparse/nn/layer/conv.py @@ -231,7 +231,6 @@ def __init__(self, padding=0, dilation=1, groups=1, - key=None, padding_mode='zeros', weight_attr=None, bias_attr=None, @@ -244,7 +243,7 @@ def __init__(self, dilation=dilation, groups=groups, subm=False, - key=key, + key=None, padding_mode=padding_mode, weight_attr=weight_attr, bias_attr=bias_attr, From d38563b5bda1c67c32c2b0a77c664fa0b11be535 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 12:20:52 +0000 Subject: [PATCH 55/70] opt code structure --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 108 +++--------------- .../phi/kernels/sparse/gpu/convolution.cu.h | 60 +++------- .../sparse/gpu/convolution_grad_kernel.cu | 20 ++-- .../kernels/sparse/gpu/convolution_kernel.cu | 2 + 4 files changed, 47 insertions(+), 143 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 33157304424a3..270d17722c3a4 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -67,80 +67,17 @@ __global__ void ScatterKernel(const T* input, out + indices_i * channels + channels_i * VecSize); } } - -template -__global__ void ScatterCUDAKernel(const T* params, - const IndexT* indices, - T* output, - size_t index_size, - size_t slice_size, - bool overwrite) { - const size_t vec_slice_size = slice_size / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - CUDA_KERNEL_LOOP_TYPE(i, index_size * vec_slice_size, int64_t) { - int64_t indices_i = i / vec_slice_size; - int64_t slice_i = - i - indices_i * vec_slice_size; // offset inside the slice - IndexT scatter_i = indices[indices_i]; - - int64_t out_i = scatter_i * slice_size + slice_i * VecSize; - LoadT vec_params, vec_out; - phi::Load(params + i * VecSize, &vec_params); - phi::Load(output + out_i, &vec_out); -#pragma unroll - for (int j = 0; j < VecSize; j++) { - vec_out[j] += 
vec_params[j]; - } - phi::Store(vec_out, output + out_i); - // output[out_i] += params[i]; - } -} - +// scatter's index has been grouped in advance +// index_counts record the count of every group +// index_groups save the index of every group template __global__ void ScatterKernelV2(const T* input, - const int* out_index_counts, - const int* origin_out_indexs, - const int non_zero_num, - const int kernel_size, - const int channels, - T* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; - i += gridDim.x * blockDim.x) { - int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; - - int len = out_index_counts[indices_i]; - // max(end-start) = kernel_size - StoreT sums = {static_cast(0)}; - phi::Load(out + indices_i * channels + channels_i * VecSize, - &sums); - for (int j = 0; j < len; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); -#pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; - } - } - phi::Store(sums, - out + indices_i * channels + channels_i * VecSize); - } -} - -template -__global__ void ScatterKernelV3(const T* input, - const int* out_index_counts, - const int* origin_out_indexs, + const int* index_counts, + const int* index_groups, const int non_zero_num, const int kernel_size, const int channels, + const int buffer_counts, T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; @@ -151,31 +88,22 @@ __global__ void ScatterKernelV3(const T* input, int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; - int len1 = out_index_counts[indices_i]; StoreT sums = {static_cast(0)}; phi::Load(out + indices_i * channels + channels_i * VecSize, &sums); - for (int j = 0; j < len1; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); + for (int it = 0; it < buffer_counts; it++) { + int len = index_counts[indices_i + it * non_zero_num]; + const int group_offset = it * kernel_size * non_zero_num; + for (int j = 0; j < len; j++) { + const int out_feature_i = + index_groups[indices_i * kernel_size + j + group_offset]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); #pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; - } - } - - int len2 = out_index_counts[non_zero_num + indices_i]; - for (int j = 0; j < len2; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j + - kernel_size * non_zero_num]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); -#pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } } } phi::Store(sums, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 139259f48c8b3..f1d5074a777c0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -60,44 +60,16 @@ __global__ void GatherKernel(const T* params, } } -// the index_counts records the number of times the same index 
will be gather -template -__global__ void GatherKernelV2(const T* inputs, - const int* index_counts, - const int* origin_indexs, - const int non_zero_num, - const int kernel_size, - T* output, - const int channels) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; - i += gridDim.x * blockDim.x) { - int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; - int len = index_counts[indices_i]; - LoadT in_vec; - phi::Load(inputs + indices_i * channels + channels_i * VecSize, - &in_vec); - for (int j = 0; j < len; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); - } - } -} - // double sparse, seed GroupIndexs template -__global__ void GatherKernelV3(const T* inputs, +__global__ void GatherKernelV2(const T* inputs, const int* index_counts, - const int* origin_indexs, + const int* index_groups, const int non_zero_num, const int kernel_size, - T* output, - const int channels) { + const int channels, + const int buffer_count, + T* output) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; using LoadT = phi::AlignedVector; @@ -106,21 +78,19 @@ __global__ void GatherKernelV3(const T* inputs, i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; - int len1 = index_counts[indices_i]; LoadT in_vec; phi::Load(inputs + indices_i * channels + channels_i * VecSize, &in_vec); - for (int j = 0; j < len1; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); - } - int len2 = index_counts[non_zero_num + indices_i]; - for (int j = 0; j < len2; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j + - kernel_size * non_zero_num]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); +#pragma unroll + for (int it = 0; it < buffer_count; it++) { + int len = index_counts[indices_i + it * non_zero_num]; + const int group_offset = it * kernel_size * non_zero_num; +#pragma unroll + for (int j = 0; j < len; j++) { + int out_i = index_groups[indices_i * kernel_size + j + group_offset]; + phi::Store( + in_vec, output + out_i * channels + channels_i * VecSize); + } } } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 741720b20b9ff..b6f674576f934 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -178,7 +178,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.nnz() * in_channels / VecSize, 1); - GatherKernelV3 + GatherKernelV2 << + GatherKernelV2 <<nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV3 + phi::funcs::sparse::ScatterKernelV2 <<nnz(), kernel_size, in_channels, + 2, x_grad_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV3 + phi::funcs::sparse::ScatterKernelV2 <<nnz(), kernel_size, in_channels, + 2, x_grad_values_ptr); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu 
b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 6c071d963ebd2..1381373abc03b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -246,6 +246,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out->nnz(), kernel_size, out_channels, + 1, out_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( @@ -260,6 +261,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out->nnz(), kernel_size, out_channels, + 1, out_values_ptr); } } From d06527fa24e0d9ca01b5231173da173a07f92604 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 02:27:06 +0000 Subject: [PATCH 56/70] opt gather/scatter code structure --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 44 +++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 113 ++++++++++------- .../sparse/gpu/convolution_grad_kernel.cu | 119 ++++-------------- .../kernels/sparse/gpu/convolution_kernel.cu | 72 +++-------- 4 files changed, 154 insertions(+), 194 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 270d17722c3a4..f7c4b7642d7bd 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #define VecBytes 16 @@ -111,6 +113,48 @@ __global__ void ScatterKernelV2(const T* input, } } +template +void ScatterV2(const GPUContext& dev_ctx, + const T* input, + const int* index_counts, + const int* index_groups, + const int non_zero_num, + const int kernel_size, + const int channels, + const int buffer_counts, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels / VecSize, 1); + ScatterKernelV2<<>>(input, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_counts, + output); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels, 1); + ScatterKernelV2<<>>(input, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_counts, + output); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index f1d5074a777c0..47409d3a0664b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -95,6 +96,75 @@ __global__ void GatherKernelV2(const T* inputs, } } +template +inline void Gather(const GPUContext& dev_ctx, + const T* inputs, + const IntT* indices, + const int indices_size, + const int channels, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, indices_size * channels / VecSize, 1); + GatherKernel + <<>>(inputs, indices, output, indices_size, channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, indices_size * channels, 1); + GatherKernel + <<>>(inputs, indices, output, indices_size, channels); + } +} + +template +inline void GatherV2(const GPUContext& dev_ctx, + const T* inputs, + const int* index_counts, + const int* index_groups, + const int non_zero_num, + const int kernel_size, + const int channels, + const int buffer_count, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels / VecSize, 1); + GatherKernelV2<<>>(inputs, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_count, + output); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels, 1); + GatherKernelV2<<>>(inputs, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_count, + output); + } +} + // unique the out indexs in rulebook template __global__ void UniqueKernel(const IntT* in_indexs, @@ -151,47 +221,6 @@ __global__ void GroupIndexs(const int* out_index_table, } } -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook - **/ -template -__global__ void UpdateIndexKernel(const T* unique_keys, - const int* unique_values, - const int* out_indexs, - const int64_t non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - T* out_indices, - T* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const T index = unique_keys[i]; - T batch, x, y, z; - phi::funcs::sparse::IndexToPoint( - index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (T j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - /** * @brief product rulebook * for input_i in x_indices: @@ -295,7 +324,6 @@ __global__ void GetOutIndexTable(int* indexs, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); - // index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -334,7 +362,6 @@ __global__ void CopyRuleBook(const int* counters, } } int inner_index = i - offsets[kernel_index]; - // out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[len + i] = in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index b6f674576f934..1a6842416e3dd 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -24,8 +24,6 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" -#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { @@ -99,7 +97,6 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; - // set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); @@ -109,12 +106,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::EmptyLike(dev_ctx, x.non_zero_indices()); DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); T* x_grad_values_ptr = x_grad_values.data(); - // set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, 0, sizeof(T) * x_grad_values.numel(), dev_ctx.stream()); - // set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync( d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy(dev_ctx, @@ -160,7 +155,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); GroupIndexsV2<< - <<>>(x.non_zero_elements().data(), - out_index_ptr, - unique_value_ptr, - x.nnz(), - kernel_size, - in_channels, - 2, - in_features_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels, 1); - GatherKernelV2 - <<>>(x.non_zero_elements().data(), - out_index_ptr, - unique_value_ptr, - x.nnz(), - kernel_size, - in_channels, - 2, - in_features_ptr); - } + GatherV2(dev_ctx, + x.non_zero_elements().data(), + out_index_ptr, + unique_value_ptr, + x.nnz(), + kernel_size, + in_channels, + 2, + in_features_ptr); - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels / VecSize, 
1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - out_grad_features_ptr, - rulebook_len, - out_channels); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - out_grad_features_ptr, - rulebook_len, - out_channels); - } + Gather(dev_ctx, + out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + out_channels, + out_grad_features_ptr); const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -276,37 +229,15 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (in_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(d_x_features_ptr, - out_index.data(), - unique_value.data(), - x_grad->nnz(), - kernel_size, - in_channels, - 2, - x_grad_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x_grad->nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(d_x_features_ptr, - out_index.data(), - unique_value.data(), - x_grad->nnz(), - kernel_size, - in_channels, - 2, - x_grad_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + 2, + x_grad_values_ptr); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 1381373abc03b..70453d371cc50 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -158,32 +158,12 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - const int VecSize = VecBytes / sizeof(T); - if (in_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, n * in_channels / VecSize, 1); - GatherKernel - <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, - n, - in_channels); - } else { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel - <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, - n, - in_channels); - } + Gather(dev_ctx, + x.non_zero_elements().data(), + rulebook_ptr, + n, + in_channels, + in_features_ptr); // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -233,37 +213,15 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - 1, - out_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - 1, - out_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + 1, + out_values_ptr); } /** From 842acf70487069a4088defa92bd685db1550e14c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 04:11:17 +0000 Subject: [PATCH 57/70] fix pool --- paddle/phi/api/yaml/sparse_api.yaml | 6 +-- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +-- .../sparse/cpu/sparse_pool_grad_kernel.cc | 15 +++---- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 14 +++++-- .../phi/kernels/sparse/gpu/convolution.cu.h | 36 +++++++---------- .../sparse/gpu/sparse_pool_grad_kernel.cu | 32 +++++---------- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 39 +++++++++++-------- .../kernels/sparse/sparse_pool_grad_kernel.h | 4 +- .../phi/kernels/sparse/sparse_pool_kernel.h | 17 ++++++-- .../tests/kernels/test_sparse_pool_dev_api.cc | 27 ++++++++----- 10 files changed, 104 insertions(+), 92 deletions(-) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index cc583fce1f8a1..a73529dde3c17 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -169,11 +169,11 @@ - api: maxpool args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) - output : Tensor(out), Tensor(rulebook) + output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_maxpool{sparse_coo -> sparse_coo, dense} + func : sparse_maxpool{sparse_coo -> sparse_coo, dense, dense} layout : x - intermediate : rulebook + intermediate : rulebook, counter backward : sparse_maxpool_grad - api: mv diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 04cdfccce636e..4d0371257d810 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -93,11 +93,11 @@ func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr} - backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter) + args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes) output : Tensor(x_grad) kernel : - func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo} + func : sparse_maxpool_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo} - backward_api : sqrt_grad forward : sqrt(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 64c843c07a6ef..580cfe9bb94d0 100644 --- 
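With this change the forward op returns the per-kernel-offset `counter` as an extra (intermediate) output and the backward consumes it, so the gradient kernels no longer rebuild that histogram from the rulebook. A minimal sketch of how the updated C++ dev API is exercised, mirroring the unit test touched later in this series (header paths are the pre-rename ones; error handling omitted):

```cpp
// Sketch only: assumes the MaxPool/MaxPoolGrad convenience wrappers declared
// in the sparse pool headers (renamed to pool_kernel.h / pool_grad_kernel.h
// two patches later in this series).
#include <vector>

#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"

template <typename Context>
phi::SparseCooTensor MaxPoolForwardBackward(
    const Context& dev_ctx,
    const phi::SparseCooTensor& x,
    const phi::SparseCooTensor& out_grad,
    const std::vector<int>& kernel_sizes,
    const std::vector<int>& paddings,
    const std::vector<int>& dilations,
    const std::vector<int>& strides) {
  phi::DenseTensor rulebook, counter;  // both filled by the forward pass
  phi::SparseCooTensor out = phi::sparse::MaxPool<float>(
      dev_ctx, x, kernel_sizes, paddings, dilations, strides,
      &rulebook, &counter);
  // The backward now takes `counter` alongside `rulebook`.
  return phi::sparse::MaxPoolGrad<float>(
      dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes);
}
```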
a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -28,6 +28,7 @@ template void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -36,11 +37,10 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, const int channels = x.dims()[4]; int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0); - for (int i = 0; i < rulebook_len; i++) { - counter[rulebook_ptr[i]] += 1; - } - phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); const T* in_features_ptr = x.non_zero_elements().data(); const T* out_features_ptr = out.non_zero_elements().data(); @@ -60,7 +60,7 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, phi::funcs::MaxPoolGrad grad_functor; for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter[i]; j++) { + for (int j = 0; j < counter_ptr[i]; j++) { IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; for (int c = 0; c < channels; c++) { @@ -78,6 +78,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -85,7 +86,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { MaxPoolGradCPUKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); })); } diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index f4d6e807538ea..a3224b6fe14bb 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -37,7 +37,8 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { const auto& x_dims = x.dims(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; const std::vector& real_kernel_sizes = @@ -71,7 +72,10 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, int rulebook_len = rulebook->dims()[1]; const IntT* rulebook_ptr = rulebook->data(); - const int* counter_ptr = counter_per_kernel.data(); + + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); std::vector offsets(kernel_size + 1); phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); @@ -107,7 +111,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { MaxPoolCPUKernel(dev_ctx, @@ -117,7 +122,8 @@ void MaxPoolKernel(const Context& dev_ctx, dilations, 
strides, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 47409d3a0664b..9787393d06960 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -305,7 +305,7 @@ template __global__ void GetOutIndexTable(const IntT* indices, const IntT non_zero_num, const Dims4D dims, - IntT* out_index_table) { + int* out_index_table) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT batch = indices[i]; IntT in_z = indices[i + non_zero_num]; @@ -378,7 +378,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const T* out_index_table, + const int* out_index_table, T* rulebook, int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -545,11 +545,19 @@ int ProductRuleBook(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + const int rulebook_rows = 2; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); + int* out_index_table_ptr = out_index_table.data(); + if (subm) { - const int rulebook_rows = 2; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = @@ -562,12 +570,6 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - int64_t table_size = 1; - for (int i = 0; i < out_dims.size() - 1; i++) { - table_size *= out_dims[i]; - } - DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); - IntT* out_index_table_ptr = out_index_table.data(); phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); @@ -648,10 +650,6 @@ int ProductRuleBook(const Context& dev_ctx, return rulebook_len; } else { - const int rulebook_rows = 2; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = rulebook->data(); ProductRuleBookKernel<<data(); int* unique_key_ptr = unique_key.data(); - int64_t table_size = 1; - for (int i = 0; i < out_dims.size() - 1; i++) { - table_size *= out_dims[i]; - } - DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); - int* out_index_table_ptr = out_index_table.data(); phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); phi::backends::gpu::GpuMemsetAsync( diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index 5fe6e68c1e83f..12225da7a01fb 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -55,6 +55,7 @@ template void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const 
DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -63,23 +64,9 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, const int in_channels = x.dims()[4]; int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(rulebook_len, 0); - phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], - rulebook_ptr, - rulebook_len * sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - - dev_ctx.stream()); - dev_ctx.Wait(); - for (int i = 0; i < rulebook_len; i++) { - counter[h_counter[i]] += 1; - } - phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); const T* in_features_ptr = x.non_zero_elements().data(); const T* out_features_ptr = out.non_zero_elements().data(); @@ -99,12 +86,12 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, &x_grad_indices); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter_ptr[i] <= 0) { continue; } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter[i] * in_channels, 1); + dev_ctx, counter_ptr[i] * in_channels, 1); MaxPoolGradCudaKernel <<>>(in_features_ptr, out_features_ptr, out_grad_ptr, - rulebook_ptr + offsets[i] + rulebook_len, - counter[i], + rulebook_ptr + offsets[i], + counter_ptr[i], rulebook_len, in_channels, x_grad_ptr); @@ -124,6 +111,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -131,7 +119,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { MaxPoolGradGPUKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); })); } diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu index bc6723d26b7a6..61a622075efde 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -55,7 +55,8 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { const auto& x_dims = x.dims(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; const std::vector& real_kernel_sizes = @@ -65,7 +66,7 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - std::vector offsets(kernel_size + 1), counter(kernel_size); + std::vector offsets(kernel_size + 1), h_counter(kernel_size); DenseTensorMeta counter_meta( DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); @@ -89,13 +90,16 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &counter, + &h_counter, &offsets); const IntT* rulebook_ptr = rulebook->data(); T* out_features_ptr = out->mutable_non_zero_elements()->data(); const T* 
in_features_ptr = x.non_zero_elements().data(); + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); // 2. max pool #ifdef PADDLE_WITH_HIP thrust::fill(thrust::hip::par.on(dev_ctx.stream()), @@ -107,22 +111,21 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, static_cast(0)); // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (h_counter[i] <= 0) { continue; } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter[i] * in_channels, 1); - MaxPoolCudaKernel - <<>>(in_features_ptr, - rulebook_ptr + offsets[i] + rulebook_len, - counter[i], - rulebook_len, - in_channels, - out_features_ptr); + dev_ctx, h_counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>(in_features_ptr, + rulebook_ptr + offsets[i], + h_counter[i], + rulebook_len, + in_channels, + out_features_ptr); } } @@ -134,7 +137,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { MaxPoolGPUKernel(dev_ctx, @@ -144,7 +148,8 @@ void MaxPoolKernel(const Context& dev_ctx, dilations, strides, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h index 2f7366a010aaa..ef9f8418b0116 100644 --- a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -25,6 +25,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -34,12 +35,13 @@ template SparseCooTensor MaxPoolGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes) { SparseCooTensor x_grad; MaxPoolGradKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); return x_grad; } diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h index d5248a1ad250e..9f4939da8d52a 100644 --- a/paddle/phi/kernels/sparse/sparse_pool_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -29,7 +29,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook); + DenseTensor* rulebook, + DenseTensor* counter); template SparseCooTensor MaxPool(const Context& dev_ctx, @@ -38,10 +39,18 @@ SparseCooTensor MaxPool(const Context& dev_ctx, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { SparseCooTensor coo; - MaxPoolKernel( - dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); + MaxPoolKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + &coo, + rulebook, + counter); return coo; } diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc 
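Both pool kernels turn the per-kernel `counter` into segment offsets before launching one small kernel per non-empty segment, so segment i of the rulebook starts at `offsets[i]` and holds `counter[i]` entries. A standalone sketch of that bookkeeping, assuming `phi::funcs::sparse::PrefixSum` is an exclusive scan:

```cpp
// Standalone illustration; assumption: PrefixSum performs an exclusive scan,
// so offsets has kernel_size + 1 entries and offsets[kernel_size] equals the
// total rulebook length.
#include <cstdio>
#include <vector>

std::vector<int> ExclusivePrefixSum(const std::vector<int>& counter) {
  std::vector<int> offsets(counter.size() + 1, 0);
  for (size_t i = 0; i < counter.size(); ++i) {
    offsets[i + 1] = offsets[i] + counter[i];
  }
  return offsets;
}

int main() {
  const std::vector<int> counter = {3, 0, 5};  // hits per kernel offset
  const std::vector<int> offsets = ExclusivePrefixSum(counter);
  // Segment i is rulebook_ptr + offsets[i] with counter[i] entries; empty
  // segments (counter[i] == 0) are skipped before the launch.
  for (size_t i = 0; i < counter.size(); ++i) {
    std::printf("segment %zu: [%d, %d)\n", i, offsets[i],
                offsets[i] + counter[i]);
  }
  return 0;
}
```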
b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 460dca59c718c..36fa99d9bfc75 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" @@ -89,14 +90,15 @@ void TestMaxPoolBase(const std::vector& indices, }; if (!std::is_same::value) { - DenseTensor rulebook; + DenseTensor rulebook, counter; SparseCooTensor out = sparse::MaxPool(dev_ctx_cpu, x_tensor, kernel_sizes, paddings, dilations, strides, - &rulebook); + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -113,7 +115,7 @@ void TestMaxPoolBase(const std::vector& indices, if (backward) { SparseCooTensor x_grad = sparse::MaxPoolGrad( - dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes); + dev_ctx_cpu, x_tensor, rulebook, counter, out, out, kernel_sizes); f_verify(x_grad.non_zero_elements().data(), features_grad); } } @@ -149,14 +151,16 @@ void TestMaxPoolBase(const std::vector& indices, SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); - DenseTensor d_rulebook; + DenseTensor d_rulebook, d_counter; SparseCooTensor d_out = sparse::MaxPool(dev_ctx_gpu, d_x_tensor, kernel_sizes, paddings, dilations, strides, - &d_rulebook); + &d_rulebook, + &d_counter); + SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -168,7 +172,7 @@ void TestMaxPoolBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -182,15 +186,20 @@ void TestMaxPoolBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - SparseCooTensor x_grad = sparse::MaxPoolGrad( - dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes); + SparseCooTensor x_grad = sparse::MaxPoolGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_counter, + d_out, + d_out, + kernel_sizes); DenseTensor h_features_grad = phi::EmptyLike(dev_ctx_cpu, x_grad.non_zero_elements()); phi::Copy(dev_ctx_gpu, From 5751987c6407cf3492b047e012515dd686fdfc07 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 05:40:29 +0000 Subject: [PATCH 58/70] rename pool_kernel.cc --- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 183 ---------- .../sparse/cpu/sparse_pool_grad_kernel.cc | 103 ------ .../kernels/sparse/cpu/sparse_pool_kernel.cc | 140 -------- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 340 ------------------ .../sparse/gpu/sparse_pool_grad_kernel.cu | 136 ------- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 167 --------- .../phi/kernels/sparse/sparse_mask_kernel.h | 36 -- .../kernels/sparse/sparse_pool_grad_kernel.h | 49 --- .../phi/kernels/sparse/sparse_pool_kernel.h | 58 --- 
.../sparse/sparse_utils_grad_kernel.cc | 1 - .../kernels/sparse/sparse_utils_grad_kernel.h | 2 +- .../tests/kernels/test_sparse_pool_dev_api.cc | 4 +- 12 files changed, 3 insertions(+), 1216 deletions(-) delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/sparse_mask_kernel.h delete mode 100644 paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h delete mode 100644 paddle/phi/kernels/sparse/sparse_pool_kernel.h diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc deleted file mode 100644 index cf2acd8557333..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" - -namespace phi { -namespace sparse { - -template -void SparseMaskCPUKernel(const CPUContext& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - const DDim& dims = x.dims(); - PADDLE_ENFORCE_EQ( - x.dims(), - mask.dims(), - phi::errors::InvalidArgument("the input x and mask must have the shape")); - const DenseTensor& indices = mask.non_zero_indices(); - const DenseTensor& values = mask.non_zero_elements(); - const int sparse_dim = mask.sparse_dim(); - - DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); - DenseTensor out_values = phi::EmptyLike(dev_ctx, values); - - // the out_indices is same as indices of mask - phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - - T* out_values_ptr = out_values.data(); - const T* x_ptr = x.data(); - - const int64_t non_zero_num = mask.nnz(); - auto dims_2d = flatten_to_2d(dims, sparse_dim); - const int cols = dims_2d[1]; - const IntT* indices_ptr = indices.data(); - - std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); - - phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, sparse_offsets.data()); - - for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = phi::funcs::sparse::CoordinateToIndex( - indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); - memcpy(out_values_ptr + i * cols, x_ptr + index * 
cols, cols * sizeof(T)); - } - - out->SetMember(out_indices, out_values, dims, true); -} - -/** - * @brief Filter the DenseTensor x by the - * mask.non_zero_indices() and output a SparseCooTensor - * x and mask must have the same shape. - **/ -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { - SparseMaskCPUKernel(dev_ctx, x, mask, out); - })); -} - -template -void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - mask_indices.dims().size(), - 2, - phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - - const int32_t sparse_dim = x.sparse_dim(); - - std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), - mask_indexs(mask_indices.dims()[1]); - phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, sparse_offsets.data()); - - phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - 0, - 1, - x_indexs.data()); - phi::funcs::sparse::FlattenIndices(mask_indices.data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - 0, - 1, - mask_indexs.data()); - - std::unordered_map x_indexs_map; - for (uint64_t i = 0; i < x_indexs.size(); i++) { - x_indexs_map[x_indexs[i]] = i; - } - *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - T* out_ptr = out->data(); - memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - const T* in_ptr = x.non_zero_elements().data(); - // TODO(zhangkaihuo): multithreading can be used for acceleration - for (uint64_t i = 0; i < mask_indexs.size(); i++) { - auto iter = x_indexs_map.find(mask_indexs[i]); - if (iter != x_indexs_map.end()) { - memcpy(out_ptr + i * stride, - in_ptr + iter->second * stride, - stride * sizeof(T)); - } - } -} - -/** - * @brief filter values from x.values() using mask_indices - */ -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { - SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_mask, - CPU, - ALL_LAYOUT, - phi::sparse::SparseMaskKernel, - float, - double, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_mask_helper, - CPU, - ALL_LAYOUT, - phi::sparse::SparseMaskHelperKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc deleted file mode 100644 index 580cfe9bb94d0..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
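The mask kernels being moved here all rely on the same trick: fold an n-dimensional sparse coordinate into a single integer key using row-major strides over the sparse dimensions, so membership checks become plain integer lookups. A hypothetical, self-contained sketch of that flattening (the real helpers are `phi::funcs::sparse::CalcOffsetsPerDim`, `FlattenIndices`, and `CoordinateToIndex`):

```cpp
// Hypothetical sketch: indices are stored column-wise, one row per sparse
// dimension, and each coordinate is folded into one integer with row-major
// strides over the sparse dimensions.
#include <cstdint>
#include <vector>

std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& sparse_dims) {
  std::vector<int64_t> strides(sparse_dims.size(), 1);
  for (int j = static_cast<int>(sparse_dims.size()) - 2; j >= 0; --j) {
    strides[j] = strides[j + 1] * sparse_dims[j + 1];
  }
  return strides;
}

int64_t FlattenIndex(const std::vector<int64_t>& indices,  // sparse_dim x nnz
                     const std::vector<int64_t>& strides,
                     int64_t nnz,
                     int64_t i) {
  int64_t key = 0;
  for (size_t j = 0; j < strides.size(); ++j) {
    key += indices[j * nnz + i] * strides[j];  // dim j of the i-th non-zero
  }
  return key;
}
```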
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int channels = x.dims()[4]; - int rulebook_len = rulebook.dims()[1]; - const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1); - const int* counter_ptr = counter.data(); - - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - - const T* in_features_ptr = x.non_zero_elements().data(); - const T* out_features_ptr = out.non_zero_elements().data(); - const T* out_grad_ptr = out_grad.non_zero_elements().data(); - // TODO(zhangkaihuo): call phi::sparse::EmptyLike - DenseTensor x_grad_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - T* x_grad_ptr = x_grad_values.data(); - memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &x_grad_indices); - - phi::funcs::MaxPoolGrad grad_functor; - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter_ptr[i]; j++) { - IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; - IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; - for (int c = 0; c < channels; c++) { - grad_functor.compute(in_features_ptr[in_i * channels + c], - out_features_ptr[out_i * channels + c], - out_grad_ptr[out_i * channels + c], - 1, - &x_grad_ptr[in_i * channels + c]); - } - } - } -} - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { - MaxPoolGradCPUKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool_grad, - CPU, - ALL_LAYOUT, - phi::sparse::MaxPoolGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc deleted file mode 100644 index a3224b6fe14bb..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ /dev/null @@ -1,140 +0,0 @@ 
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" - -namespace phi { -namespace sparse { - -/** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) - **/ -template -void MaxPoolCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - const auto& x_dims = x.dims(); - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const std::vector& real_kernel_sizes = - phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); - DDim out_dims = {1, 1, 1, 1, 1}; - phi::funcs::sparse::GetOutShape( - x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); - const int in_channels = real_kernel_sizes[3]; - - // DenseTensorMeta counter_meta( - // DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, - // std::move(counter_meta)); - std::vector counter_per_kernel(kernel_size, 0); - - const T* in_features_ptr = x.non_zero_elements().data(); - // 1. product rule book - ProductRuleBook(dev_ctx, - x, - real_kernel_sizes, - paddings, - dilations, - strides, - out_dims, - false, - rulebook, - &counter_per_kernel); - - UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); - - int rulebook_len = rulebook->dims()[1]; - const IntT* rulebook_ptr = rulebook->data(); - - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); - - std::vector offsets(kernel_size + 1); - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - std::vector out_flags(out->nnz(), false); - - // 2. 
max pool - T* out_features_ptr = out->mutable_non_zero_elements()->data(); - phi::funcs::MaxPool max_pool_functor; - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter_ptr[i]; j++) { - IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; - IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; - if (!out_flags[out_i]) { - out_flags[out_i] = true; - memcpy(&out_features_ptr[out_i * in_channels], - &in_features_ptr[in_i * in_channels], - in_channels * sizeof(T)); - } else { - for (int c = 0; c < in_channels; c++) { - max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], - &out_features_ptr[out_i * in_channels + c]); - } - } - } - } -} - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { - MaxPoolCPUKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - out, - rulebook, - counter); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool, - CPU, - ALL_LAYOUT, - phi::sparse::MaxPoolKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu deleted file mode 100644 index 0e399a7b0e81f..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
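The deleted CPU forward (re-added as pool_kernel.cc two patches later) shows the core of the sparse max pool: each rulebook entry maps one input non-zero to one output non-zero, the first hit initializes the output row, and later hits take a channel-wise maximum. A compact sketch of that inner loop:

```cpp
// Illustration of the rulebook-driven max pool; in the real kernel the input
// and output row indices come from the second and third rulebook rows, and
// phi::funcs::MaxPool keeps the running maximum per channel.
#include <algorithm>
#include <vector>

void MaxPoolByRulebook(const std::vector<float>& in,   // nnz_in x channels
                       const std::vector<int>& in_idx,
                       const std::vector<int>& out_idx,
                       int channels,
                       std::vector<float>* out) {      // nnz_out x channels
  std::vector<bool> initialized(out->size() / channels, false);
  for (size_t r = 0; r < in_idx.size(); ++r) {
    const float* src = in.data() + in_idx[r] * channels;
    float* dst = out->data() + out_idx[r] * channels;
    if (!initialized[out_idx[r]]) {
      initialized[out_idx[r]] = true;
      std::copy(src, src + channels, dst);            // first contribution
    } else {
      for (int c = 0; c < channels; ++c) {
        dst[c] = std::max(dst[c], src[c]);            // later contributions
      }
    }
  }
}
```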
*/ - -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" -#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaskKernel(const T* x_ptr, - const IntT* indices_ptr, - const int64_t* sparse_offsets, - const int64_t non_zero_num, - const int cols, - const int sparse_dim, - T* out_values_ptr) { - CUDA_KERNEL_LOOP_TYPE(i, non_zero_num * cols, int64_t) { - int64_t out_i = i / cols; - int64_t col_i = i - out_i * cols; - int64_t index = 0; - for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; - } - out_values_ptr[out_i * cols + col_i] = x_ptr[index * cols + col_i]; - } -} - -template -void SparseMaskGPUKernel(const GPUContext& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - const DDim& dims = x.dims(); - PADDLE_ENFORCE_EQ( - x.dims(), - mask.dims(), - phi::errors::InvalidArgument("the input x and mask must have the shape")); - const DenseTensor& indices = mask.non_zero_indices(); - const DenseTensor& values = mask.non_zero_elements(); - const int sparse_dim = mask.sparse_dim(); - DenseTensor sparse_offsets = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); - std::vector h_sparse_offsets(sparse_dim); - phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, h_sparse_offsets.data()); - - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif - dev_ctx.stream()); - - DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); - DenseTensor out_values = phi::EmptyLike(dev_ctx, values); - - phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - - const IntT* indices_ptr = indices.data(); - T* out_values_ptr = out_values.data(); - const T* x_ptr = x.data(); - const int64_t non_zero_num = mask.nnz(); - auto dims_2d = flatten_to_2d(dims, sparse_dim); - const int cols = dims_2d[1]; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel - <<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); - - out->SetMember(out_indices, out_values, dims, true); -} - -/** - * @brief Filter the DenseTensor x by the - * mask.non_zero_indices() and output a SparseCooTensor - * x and mask must have the same shape. 
- **/ -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { - SparseMaskGPUKernel(dev_ctx, x, mask, out); - })); -} - -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - const int64_t stride, - T* out_values) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride / VecSize; k++) { - // out_values[i * stride + k] = x_values[j * stride + k]; - LoadT vec_x; - phi::Load(x_values + j * stride + k * VecSize, &vec_x); - phi::Store(vec_x, out_values + i * stride + k * VecSize); - } - } - } -} - -template -__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - int index = x_indexs[i]; - table[index] = i == 0 ? -1 : i; - } -} - -template -__global__ void MaskCopy(const IntT* mask_indexs, - const int* table, - const int n, - const int stride, - const T* x_values, - T* out_values) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - int j = table[mask_indexs[i]]; - if (j != 0) { - if (j == -1) j = 0; - for (int k = 0; k < stride; k += VecSize) { - LoadT vec_x; - phi::Load(x_values + j * stride + k, &vec_x); - phi::Store(vec_x, out_values + i * stride + k); - } - } - } -} - -template -void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - mask_indices.dims().size(), - 2, - phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - - const int32_t sparse_dim = x.sparse_dim(); - auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); - - std::vector sparse_offsets(sparse_dim); - - DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); - DenseTensorMeta mask_indexs_meta( - indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); - DenseTensorMeta sparse_offset_meta( - indices_dtype, {sparse_dim}, DataLayout::NCHW); - - DenseTensor x_indexs = - phi::Empty(dev_ctx, std::move(x_indexs_meta)); - DenseTensor mask_indexs = - phi::Empty(dev_ctx, std::move(mask_indexs_meta)); - DenseTensor bound_out = - phi::Empty(dev_ctx, std::move(mask_indexs_meta)); - DenseTensor d_sparse_offsets = - phi::Empty(dev_ctx, std::move(sparse_offset_meta)); - IntT* x_indexs_ptr = x_indexs.data(); - IntT* mask_indexs_ptr = mask_indexs.data(); - IntT* bound_out_ptr = bound_out.data(); - - // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, sparse_offsets.data()); - // 2. copy sparse_offsets to device - phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif - dev_ctx.stream()); - - // 3. 
flatten x indices and mask indices - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - phi::funcs::sparse::FlattenIndicesKernel<<>>( - x.non_zero_indices().data(), - d_sparse_offsets.data(), - x_indexs.numel(), - sparse_dim, - x_indexs_ptr); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - phi::funcs::sparse::FlattenIndicesKernel<<>>( - mask_indices.data(), - d_sparse_offsets.data(), - mask_indexs.numel(), - sparse_dim, - mask_indexs_ptr); - - int table_size = 1; - auto x_dims = x.dims(); - for (int i = 0; i < x_dims.size() - 1; i++) { - table_size *= x_dims[i]; - } - DenseTensor table = phi::Empty(dev_ctx, {table_size}); - cudaMemsetAsync( - table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, out, static_cast(0)); - T* out_ptr = out->data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - MaskTable<<>>( - x_indexs_ptr, x_indexs.numel(), table.data()); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - const int VecSize = VecBytes / sizeof(T); - if (stride % VecSize == 0) { - MaskCopy - <<>>(mask_indexs_ptr, - table.data(), - mask_indexs.numel(), - stride, - x.non_zero_elements().data(), - out_ptr); - } else { - MaskCopy<<>>(mask_indexs_ptr, - table.data(), - mask_indexs.numel(), - stride, - x.non_zero_elements().data(), - out_ptr); - } -} - -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { - SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_mask, - GPU, - ALL_LAYOUT, - phi::sparse::SparseMaskKernel, - float, - double, - phi::dtype::float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_mask_helper, - GPU, - ALL_LAYOUT, - phi::sparse::SparseMaskHelperKernel, - float, - double, - phi::dtype::float16, - uint8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu deleted file mode 100644 index 12225da7a01fb..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
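The GPU mask helper shown here builds a dense lookup table over flattened indices, and because the table is zero-initialized it needs a sentinel: position 0 of x is stored as -1, while a stored 0 means "this flattened index is not present". A small CPU sketch of that encoding:

```cpp
// CPU illustration of the MaskTable/MaskCopy sentinel encoding: 0 in the table
// means "absent", -1 encodes position 0, any other value is the position
// itself.
#include <vector>

std::vector<int> BuildTable(const std::vector<int>& x_indexs, int table_size) {
  std::vector<int> table(table_size, 0);            // zero-initialized
  for (int i = 0; i < static_cast<int>(x_indexs.size()); ++i) {
    table[x_indexs[i]] = (i == 0) ? -1 : i;         // -1 stands in for 0
  }
  return table;
}

// Returns the position of `mask_index` in x_indexs, or -1 if it is absent.
int Lookup(const std::vector<int>& table, int mask_index) {
  const int j = table[mask_index];
  if (j == 0) return -1;                            // not present in x
  return (j == -1) ? 0 : j;                         // decode the sentinel
}
```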
*/ - -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, - const T* out_features_ptr, - const T* out_grad_ptr, - const IntT* rulebook_ptr, - const int n, - const int rulebook_len, - const int channels, - T* x_grad_ptr) { - phi::funcs::MaxPoolGrad grad_functor; - CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { - int real_i = i / channels; - int c = i - real_i * channels; - IntT in_i = rulebook_ptr[real_i]; - IntT out_i = rulebook_ptr[real_i + rulebook_len]; - grad_functor.compute(in_features_ptr[in_i * channels + c], - out_features_ptr[out_i * channels + c], - out_grad_ptr[out_i * channels + c], - 1, - &x_grad_ptr[in_i * channels + c]); - } -} - -template -void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int in_channels = x.dims()[4]; - int rulebook_len = rulebook.dims()[1]; - const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1); - const int* counter_ptr = counter.data(); - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - - const T* in_features_ptr = x.non_zero_elements().data(); - const T* out_features_ptr = out.non_zero_elements().data(); - const T* out_grad_ptr = out_grad.non_zero_elements().data(); - // TODO(zhangkaihuo): call phi::sparse::EmptyLike - DenseTensor x_grad_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - T* x_grad_ptr = x_grad_values.data(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &x_grad_indices); - - for (int i = 0; i < kernel_size; i++) { - if (counter_ptr[i] <= 0) { - continue; - } - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter_ptr[i] * in_channels, 1); - MaxPoolGradCudaKernel - <<>>(in_features_ptr, - out_features_ptr, - out_grad_ptr, - rulebook_ptr + offsets[i], - counter_ptr[i], - rulebook_len, - in_channels, - x_grad_ptr); - } -} - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { - MaxPoolGradGPUKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool_grad, - GPU, - 
ALL_LAYOUT, - phi::sparse::MaxPoolGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu deleted file mode 100644 index 61a622075efde..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaxPoolCudaKernel(const T* in_features_ptr, - const IntT* rulebook_ptr, - const int n, - const int rulebook_len, - const int channels, - T* out_features_ptr) { - phi::funcs::MaxPool max_pool_functor; - CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { - int real_i = i / channels; - int channel_i = i - real_i * channels; - IntT in_i = rulebook_ptr[real_i]; - IntT out_i = rulebook_ptr[real_i + rulebook_len]; - max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], - &out_features_ptr[out_i * channels + channel_i]); - } -} - -/** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) - **/ -template -void MaxPoolGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - const auto& x_dims = x.dims(); - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const std::vector& real_kernel_sizes = - phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); - DDim out_dims = {1, 1, 1, 1, 1}; - phi::funcs::sparse::GetOutShape( - x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); - const int in_channels = real_kernel_sizes[3]; - - std::vector offsets(kernel_size + 1), h_counter(kernel_size); - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); - DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); - DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - - // 1. 
product rulebook - int rulebook_len = ProductRuleBook(dev_ctx, - x, - real_kernel_sizes, - paddings, - dilations, - strides, - out_dims, - false, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); - - const IntT* rulebook_ptr = rulebook->data(); - - T* out_features_ptr = out->mutable_non_zero_elements()->data(); - const T* in_features_ptr = x.non_zero_elements().data(); - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); -// 2. max pool -#ifdef PADDLE_WITH_HIP - thrust::fill(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), -#endif - out_features_ptr, - out_features_ptr + out->non_zero_elements().numel(), - static_cast(0)); - // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster - for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { - continue; - } - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, h_counter[i] * in_channels, 1); - MaxPoolCudaKernel<<>>(in_features_ptr, - rulebook_ptr + offsets[i], - h_counter[i], - rulebook_len, - in_channels, - out_features_ptr); - } -} - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { - MaxPoolGPUKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - out, - rulebook, - counter); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool, - GPU, - ALL_LAYOUT, - phi::sparse::MaxPoolKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/sparse_mask_kernel.h deleted file mode 100644 index 88899e3dc672e..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_mask_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" - -namespace phi { -namespace sparse { - -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out); - -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out); - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h deleted file mode 100644 index ef9f8418b0116..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad); - -template -SparseCooTensor MaxPoolGrad(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes) { - SparseCooTensor x_grad; - MaxPoolGradKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); - return x_grad; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h deleted file mode 100644 index 9f4939da8d52a..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_pool_kernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter); - -template -SparseCooTensor MaxPool(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - DenseTensor* rulebook, - DenseTensor* counter) { - SparseCooTensor coo; - MaxPoolKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - &coo, - rulebook, - counter); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 69677be34b231..9425c14b79b36 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index a00b9c275c292..7cf97c3f48ece 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 36fa99d9bfc75..7497dca51a59c 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -23,8 +23,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/sparse/coalesced_kernel.h" -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" +#include "paddle/phi/kernels/sparse/pool_kernel.h" namespace phi { namespace tests { From 7c2fbf52f71c8835632928b7da5db7db9108e954 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 05:42:40 +0000 Subject: [PATCH 59/70] add new file --- .../final_state_generator/eager_gen.py | 3 - paddle/phi/kernels/sparse/cpu/mask_kernel.cc | 183 ++++++++++ .../kernels/sparse/cpu/pool_grad_kernel.cc | 103 ++++++ paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 140 ++++++++ .../gpu/.convolution_grad_kernel.cu.swp | Bin 0 -> 20480 bytes paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 338 ++++++++++++++++++ .../kernels/sparse/gpu/pool_grad_kernel.cu | 136 +++++++ paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 167 +++++++++ paddle/phi/kernels/sparse/mask_kernel.h | 36 ++ paddle/phi/kernels/sparse/pool_grad_kernel.h | 49 +++ paddle/phi/kernels/sparse/pool_kernel.h | 58 +++ 11 files changed, 1210 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/kernels/sparse/cpu/mask_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/pool_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp create mode 100644 paddle/phi/kernels/sparse/gpu/mask_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/pool_kernel.cu create mode 100644 paddle/phi/kernels/sparse/mask_kernel.h create mode 100644 paddle/phi/kernels/sparse/pool_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/pool_kernel.h diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index a595bf5c613c6..d406f00b25039 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -545,9 +545,6 @@ def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map backward_grad_inputs_map = self.backward_grad_inputs_map backward_attrs_list = self.backward_attrs_list - print(backward_forward_inputs_map) - print(backward_grad_inputs_map) - print(backward_attrs_list) # Check Order: TensorWrappers, GradTensors, Attributes max_fwd_input_position = -1 diff --git a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc new file mode 100644 index 0000000000000..92c015101264c --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/mask_kernel.h" + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace sparse { + +template +void SparseMaskCPUKernel(const CPUContext& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + const DDim& dims = x.dims(); + PADDLE_ENFORCE_EQ( + x.dims(), + mask.dims(), + phi::errors::InvalidArgument("the input x and mask must have the shape")); + const DenseTensor& indices = mask.non_zero_indices(); + const DenseTensor& values = mask.non_zero_elements(); + const int sparse_dim = mask.sparse_dim(); + + DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, values); + + // the out_indices is same as indices of mask + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); + + T* out_values_ptr = out_values.data(); + const T* x_ptr = x.data(); + + const int64_t non_zero_num = mask.nnz(); + auto dims_2d = flatten_to_2d(dims, sparse_dim); + const int cols = dims_2d[1]; + const IntT* indices_ptr = indices.data(); + + std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, sparse_offsets.data()); + + for (int64_t i = 0; i < non_zero_num; i++) { + int64_t index = phi::funcs::sparse::CoordinateToIndex( + indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); + memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); + } + + out->SetMember(out_indices, out_values, dims, true); +} + +/** + * @brief Filter the DenseTensor x by the + * mask.non_zero_indices() and output a SparseCooTensor + * x and mask must have the same shape. + **/ +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { + SparseMaskCPUKernel(dev_ctx, x, mask, out); + })); +} + +template +void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int32_t sparse_dim = x.sparse_dim(); + + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), + mask_indexs(mask_indices.dims()[1]); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + phi::funcs::sparse::FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + mask_indexs.data()); + + std::unordered_map x_indexs_map; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + x_indexs_map[x_indexs[i]] = i; + } + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + T* out_ptr = out->data(); + memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; + const T* in_ptr = x.non_zero_elements().data(); + // TODO(zhangkaihuo): multithreading can be used for acceleration + for (uint64_t i = 0; i < mask_indexs.size(); i++) { + auto iter = x_indexs_map.find(mask_indexs[i]); + if (iter != x_indexs_map.end()) { + memcpy(out_ptr + i * stride, + in_ptr + iter->second * stride, + stride * sizeof(T)); + } + } +} + +/** + * @brief filter values from x.values() using mask_indices + */ +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { + SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_mask, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskKernel, + float, + double, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_mask_helper, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc new file mode 100644 index 0000000000000..d17d06e6e4f14 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const IntT* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.non_zero_elements().data(); + // TODO(zhangkaihuo): call phi::sparse::EmptyLike + DenseTensor x_grad_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); + T* x_grad_ptr = x_grad_values.data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &x_grad_indices); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { + MaxPoolGradCPUKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc new file mode 100644 index 0000000000000..38e512bd00c93 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/pool_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) + **/ +template +void MaxPoolCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + // DenseTensorMeta counter_meta( + // DataType::INT32, {kernel_size}, DataLayout::NCHW); + // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, + // std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const IntT* rulebook_ptr = rulebook->data(); + + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. 
max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { + MaxPoolCPUKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + out, + rulebook, + counter); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp b/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp new file mode 100644 index 0000000000000000000000000000000000000000..e1d0e2bee6c88631e06944143561a3ec7c50b945 GIT binary patch literal 20480 zcmeHNZEPGz86GGlgtQPSsgMw2aFDrk?#p)6L|4afu}yl4V+Y?QQLxh0-tOGp;=SF? z?yT)gNGl;xBt#I3gvzh-RfJR!@PpswLn@+D1!`1*A|WIY^EhE2nGRUt_mP+}noxDCZHJvY&N+$|JgcRFuhZV22b*HtaTkX&-dcON< zX7J*9-85}h^x|9FDy}i#VRldqJlzjivE_wD!*$kOJLHz@Xf0niwMeF5gnJcGy;ls} z>kNdvnLqg;nVy;~2`Y|`eV9J<-V66SE2{pA0mXn~Krx^gPz)#r6a$KZdx`<>-AA58 zjo%vA`f_}K%Z~f&Z$_6Hr+;e4`?tl<$Kw0*JKo2S>ZTY_3@8Q^1BwB~fMP%~pcqgL zC@-yHT@Hp@x;MGIW2VMbw3j754Ch!dK1>gzbD$oJufe!$G zegYi+jkK19PlJC4IBi1hYEfV_%84n;5u*#co_KG z+X?w0@Dwl)90gvxpO6=UF9Q~E4444^g-wo^fLnk8oB$30zq*f*UjokpHZTd?58TG4 z%2$D>flmTO;Gf|67l7dVDZzPk(NQixv{}P-*EE|sbbzjX7peE+fdITt4H4=UB(^x|N{o7@6_5X1JllnZH`Pl4R3M%k(UwNI6={ zJ_@S=H_PR9gmt+73<@_(-DWdjJjg`x8k#6KJw_+9IU1WzzY$h)M3`Y*yT*JsW3qM4 z;F~!*Zfcualj%J4S&&0MI=2kX&;veGpUspCrDirNK%eHO3>eu(r9MF;8Esv+Ll$UY zh@Ohm^0TJSb&-cO9|`7XSD2_t#W5?k?HW2~nazTO*c4G{joGZj98@}6Fs%-5QdJMR zOB=Qxz_uj689D}-Kg0NGq|VH_V0x41DDsM{E%iVnZR>V34ta?vJVn!l7U|TZgK3MU zM99o845W)wV;m&`zYb3qsn;Z;mBCmNffJLJ%4Y{-+urok7QYre$=brS8}cZ|Cu#0D zkG}N6wx*GV+Ug^*Qo7sp$y-jZOU2@;l(aPJEH25@LdWCXq=f{j=N230bMt8bRL6gQSTzPTwa=c?2>Hr6fft%SU)$iIGLEA;d!BC`^_p%(|<3Q6lHNH zs0%{c0#3!~PBVtdA$g1^kJZU5a&Y(LCwdZbr`=KzQU6I6Un~NzlnD{#N06++XcQ`oe!rE3PY#~RQOhyx&}4LMMcfFSg`mGCIad)zheb~w5mHMxm2w`wtS&ZFI84nDoge1 zLX9r3(7ENM`D(qoyo9Gu(8|(fdbYYWpQFrzeV?*T56kj^B0mcQ%wlGNn1WBM5prhs z7nfQAO$)dV)g3xksnKd}jGnI4s)je1@^oGq)mkp7h-trmV zVQp<+vFR%&%~;j(l+N_xmeuT*eK{2E^StD{ZNYTHgzUs|LH zS+vNASKlF=Y>g3>Q`^=9D>&FpvbiQU)b^oNyrk~i@i%ylIJ^Fp< zhmLhMWYGcT2qxnF&H{E|nBK``XJ>Jym^wfe2M0On|NjPOC0T=i<@G;;$!0+MvzX*H@I13yC?%-Ts zocr5A2_V25IM;t2_!Dp&co}#OxCI121^6iN0PqKt_ZxsH_cNmG>QW3S1{4E|0mXn~ zKr!(D!N4GAW}m*ufR(87Ok@zjzC05V0J0xvq|cMt$4Xy0GU-k`nj@3e=>0DZAw52@ z=WVz~riSBht$`5EwE|*ED6Juu6Y;KRWfTdbK4)fTVtFYf#aoEj5z-JBR}x#YEjj}s zAnArgz-t_3g;)VZipW8jL=+^!w~QnHc2PzZz@>-ajn? 
z&_U^V&p90`GLoO%6B6;W$F|WSsdasX0w84eg7jsp!W-^|QHpw|B8U($u;uS2d<_k} zlo2yB)FTSE9AF6bzbi96itfgW3~t;iY)0%!)e(kNE?1Z8(^ETQCx=B#Mw82b6oQOh No8tu +__global__ void MaskKernel(const T* x_ptr, + const IntT* indices_ptr, + const int64_t* sparse_offsets, + const int64_t non_zero_num, + const int cols, + const int sparse_dim, + T* out_values_ptr) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num * cols, int64_t) { + int64_t out_i = i / cols; + int64_t col_i = i - out_i * cols; + int64_t index = 0; + for (int j = 0; j < sparse_dim; j++) { + index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; + } + out_values_ptr[out_i * cols + col_i] = x_ptr[index * cols + col_i]; + } +} + +template +void SparseMaskGPUKernel(const GPUContext& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + const DDim& dims = x.dims(); + PADDLE_ENFORCE_EQ( + x.dims(), + mask.dims(), + phi::errors::InvalidArgument("the input x and mask must have the shape")); + const DenseTensor& indices = mask.non_zero_indices(); + const DenseTensor& values = mask.non_zero_elements(); + const int sparse_dim = mask.sparse_dim(); + DenseTensor sparse_offsets = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); + std::vector h_sparse_offsets(sparse_dim); + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, h_sparse_offsets.data()); + + phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, values); + + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); + + const IntT* indices_ptr = indices.data(); + T* out_values_ptr = out_values.data(); + const T* x_ptr = x.data(); + const int64_t non_zero_num = mask.nnz(); + auto dims_2d = flatten_to_2d(dims, sparse_dim); + const int cols = dims_2d[1]; + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); + + out->SetMember(out_indices, out_values, dims, true); +} + +/** + * @brief Filter the DenseTensor x by the + * mask.non_zero_indices() and output a SparseCooTensor + * x and mask must have the same shape. 
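 *
 * A minimal host-side sketch of what the MaskKernel CUDA kernel above does
 * (names follow that kernel; this is an illustration, not a verbatim
 * excerpt): each COO index of `mask` is flattened into a linear row offset
 * using the row-major strides in `sparse_offsets`, and one dense row of
 * `cols` elements is copied from x into the output values.
 *
 *   for (int64_t i = 0; i < non_zero_num; ++i) {
 *     int64_t index = 0;
 *     for (int j = 0; j < sparse_dim; ++j) {
 *       index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j];
 *     }
 *     std::memcpy(out_values_ptr + i * cols,
 *                 x_ptr + index * cols,
 *                 cols * sizeof(T));
 *   }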
+ **/ +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { + SparseMaskGPUKernel(dev_ctx, x, mask, out); + })); +} + +template +__global__ void SparseMaskCopyKernel(const IntT* x_indexs, + const IntT* mask_indexs, + const IntT* bound_out, + const T* x_values, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + const IntT j = bound_out[i]; + if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { + for (int k = 0; k < stride / VecSize; k++) { + // out_values[i * stride + k] = x_values[j * stride + k]; + LoadT vec_x; + phi::Load(x_values + j * stride + k * VecSize, &vec_x); + phi::Store(vec_x, out_values + i * stride + k * VecSize); + } + } + } +} + +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? -1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); + } + } + } +} + +template +void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int32_t sparse_dim = x.sparse_dim(); + auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); + + std::vector sparse_offsets(sparse_dim); + + DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indexs_meta( + indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); + DenseTensorMeta sparse_offset_meta( + indices_dtype, {sparse_dim}, DataLayout::NCHW); + + DenseTensor x_indexs = + phi::Empty(dev_ctx, std::move(x_indexs_meta)); + DenseTensor mask_indexs = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor bound_out = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + IntT* x_indexs_ptr = x_indexs.data(); + IntT* mask_indexs_ptr = mask_indexs.data(); + IntT* bound_out_ptr = bound_out.data(); + + // 1. calc the offsets of per dim + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + // 2. copy sparse_offsets to device + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 3. 
flatten x indices and mask indices + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); + + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + cudaMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + T* out_ptr = out->data(); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + MaskCopy + <<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } +} + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { + SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_mask, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_mask_helper, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu new file mode 100644 index 0000000000000..d4f40c7d8c19e --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const IntT* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + IntT in_i = rulebook_ptr[real_i]; + IntT out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const IntT* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.non_zero_elements().data(); + // TODO(zhangkaihuo): call phi::sparse::EmptyLike + DenseTensor x_grad_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); + T* x_grad_ptr = x_grad_values.data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &x_grad_indices); + + for (int i = 0; i < kernel_size; i++) { + if (counter_ptr[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter_ptr[i] * in_channels, 1); + MaxPoolGradCudaKernel + <<>>(in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i], + counter_ptr[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { + MaxPoolGradGPUKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + 
phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu new file mode 100644 index 0000000000000..255c6621da015 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -0,0 +1,167 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/pool_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const IntT* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + IntT in_i = rulebook_ptr[real_i]; + IntT out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) + **/ +template +void MaxPoolGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. 
product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); + + const IntT* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); +// 2. max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(0)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (h_counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, h_counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>(in_features_ptr, + rulebook_ptr + offsets[i], + h_counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { + MaxPoolGPUKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + out, + rulebook, + counter); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/mask_kernel.h b/paddle/phi/kernels/sparse/mask_kernel.h new file mode 100644 index 0000000000000..88899e3dc672e --- /dev/null +++ b/paddle/phi/kernels/sparse/mask_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out); + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/pool_grad_kernel.h b/paddle/phi/kernels/sparse/pool_grad_kernel.h new file mode 100644 index 0000000000000..ef9f8418b0116 --- /dev/null +++ b/paddle/phi/kernels/sparse/pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad); + +template +SparseCooTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes) { + SparseCooTensor x_grad; + MaxPoolGradKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/pool_kernel.h b/paddle/phi/kernels/sparse/pool_kernel.h new file mode 100644 index 0000000000000..9f4939da8d52a --- /dev/null +++ b/paddle/phi/kernels/sparse/pool_kernel.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
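
A minimal usage sketch for the MaxPool / MaxPoolGrad wrappers declared in this
header and in pool_grad_kernel.h (the float data type, the explicit template
argument, and the local variable values are illustrative assumptions; see
paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc for the real tests):

  DenseTensor rulebook, counter;
  std::vector<int> kernel_sizes{3, 3, 3}, paddings{0, 0, 0},
      dilations{1, 1, 1}, strides{1, 1, 1};
  SparseCooTensor out = phi::sparse::MaxPool<float>(
      dev_ctx, x, kernel_sizes, paddings, dilations, strides,
      &rulebook, &counter);
  SparseCooTensor x_grad = phi::sparse::MaxPoolGrad<float>(
      dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes);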
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); + +template +SparseCooTensor MaxPool(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DenseTensor* rulebook, + DenseTensor* counter) { + SparseCooTensor coo; + MaxPoolKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + &coo, + rulebook, + counter); + return coo; +} + +} // namespace sparse +} // namespace phi From 6684d944f555436bc4a1b00d329c912619b8f5d6 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 06:55:28 +0000 Subject: [PATCH 60/70] for ci --- paddle/phi/core/sparse_coo_tensor.h | 16 ++-- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 5 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 22 ++---- .../kernels/sparse/gpu/convolution_kernel.cu | 75 +++++++++---------- paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 41 ++-------- .../kernels/test_sparse_conv3d_dev_api.cc | 8 +- 6 files changed, 67 insertions(+), 100 deletions(-) diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 5982612946f7b..c69c7aab89d28 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -156,6 +156,7 @@ class SparseCooTensor : public TensorBase, /// \brief get the dnese dim int32_t dense_dim() const; + /// \brief query table according to key const std::pair>* table( const std::string& key) const { const auto& iter = table_ptr_->find(key); @@ -164,7 +165,8 @@ class SparseCooTensor : public TensorBase, } return &iter->second; } - // DenseTensor* mutable_rulebook() { return &rulebook_; } + + /// \brief set table according to key void SetTable(const std::string& key, const std::pair>& table) { auto ret = table_ptr_->insert({key, table}); @@ -173,11 +175,14 @@ class SparseCooTensor : public TensorBase, } } + /// \brief get table_ptr_ const std::shared_ptr< std::map>>>& GetTablePtr() const { return table_ptr_; } + + /// \brief set table_ptr_ void SetTablePtr( const std::shared_ptr< std::map>>>& @@ -185,9 +190,6 @@ class SparseCooTensor : public TensorBase, table_ptr_ = table_ptr; } - // const bool subm() const { return subm_; } - // void SetSubm(const bool subm) { subm_ = subm; } - private: // save the indices of non zero elements in original dense tensor DenseTensor non_zero_indices_; @@ -198,11 +200,15 @@ class SparseCooTensor : public TensorBase, // save the number of non zero elements in each batch DDim dims_; - // for sparse conv + // for submanifold conv + // SubmConv will generate a rulebook and a counter, which can be + // reused by different SubmConv. 
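  //
  // A simplified sketch of the lookup-or-build flow (condensed from the
  // Conv3d GPU kernel referenced below; not a verbatim excerpt):
  //
  //   const auto* entry = x.table(key);  // pair<rulebook, counter> or nullptr
  //   if (entry != nullptr) {
  //     // reuse entry->first (the rulebook) and entry->second (the counter)
  //   } else {
  //     // build the rulebook with ProductRuleBook(...), then cache it
  //     // (only when key is non-empty):
  //     out->SetTable(key, std::make_pair(rulebook, counter_vec));
  //   }
  //   out->SetTablePtr(x.GetTablePtr());  // share one cache across layers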
+ // refer to sparse/gpu/convolution_kernel.cu std::shared_ptr< std::map>>> table_ptr_ = std::make_shared< std::map>>>(); + /* --------------------------- */ /* example: non zero element is scalar */ /* --------------------------- */ diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index f7c4b7642d7bd..f27174d581818 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -69,9 +69,10 @@ __global__ void ScatterKernel(const T* input, out + indices_i * channels + channels_i * VecSize); } } + // scatter's index has been grouped in advance -// index_counts record the count of every group -// index_groups save the index of every group +// index_counts record the count of each group +// index_groups save the index of each group template __global__ void ScatterKernelV2(const T* input, const int* index_counts, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 9787393d06960..4363f94f69443 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -562,11 +562,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::Empty(dev_ctx, - DenseTensorMeta(x.dtype(), - {x.nnz(), kernel_sizes[4]}, - x.non_zero_elements().layout())); + DenseTensor out_values = phi::Empty(dev_ctx, {x.nnz(), kernel_sizes[4]}); + phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); @@ -609,7 +606,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr, counter_ptr); - out->SetMember(out_indices, out_values, out_dims, true); + out->SetMember(out_indices, out_values, out_dims, false); thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), counter_ptr, @@ -731,13 +728,11 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.Wait(); const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_nnz, kernel_sizes[4]}, x.non_zero_elements().layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - out->SetMember(out_indices, out_values, out_dims, true); + phi::DenseTensor out_indices = + phi::Empty(dev_ctx, {sparse_dim, out_nnz}); + phi::DenseTensor out_values = + phi::Empty(dev_ctx, {out_nnz, kernel_sizes[4]}); + out->SetMember(out_indices, out_values, out_dims, false); IntT* out_indices_ptr = out_indices.data(); @@ -754,7 +749,6 @@ int ProductRuleBook(const Context& dev_ctx, unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); - // return rulebook_len; GroupIndexs<<(dev_ctx, {kernel_size}); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor out_index; + DenseTensor unique_value; VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; - int n = 0; + int rulebook_len = 0; const IntT* rulebook_ptr = nullptr; bool need_product_rulebook = true; if (subm && !key.empty()) { @@ -93,7 +88,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; + rulebook_len = 
rulebook.dims()[1]; DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); @@ -113,24 +108,25 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, offsets[kernel_size] = offset; } } + if (need_product_rulebook) { DenseTensor tmp_rulebook; - n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - &tmp_rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); + rulebook_len = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + &tmp_rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); rulebook_ptr = tmp_rulebook.data(); out->SetTablePtr(x.GetTablePtr()); @@ -145,14 +141,10 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 2. gather - DenseTensorMeta in_features_meta( - x.dtype(), {n, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_features_meta( - x.dtype(), {n, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_features = - phi::Empty(dev_ctx, std::move(out_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; @@ -161,7 +153,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, Gather(dev_ctx, x.non_zero_elements().data(), rulebook_ptr, - n, + rulebook_len, in_channels, in_features_ptr); @@ -172,20 +164,25 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); - out_index.ResizeAndAllocate({static_cast(n)}); + out_index.ResizeAndAllocate({static_cast(rulebook_len)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); phi::backends::gpu::GpuMemsetAsync( - out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); GroupIndexs<<>>( - n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); + dev_ctx.stream()>>>(rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); } + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { if (h_counter[i] <= 0) { diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu index f1a955477a595..ad55ad08ad527 100644 --- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -71,11 +71,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); @@ -121,29 +117,6 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - 
const int64_t stride, - T* out_values) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride / VecSize; k++) { - // out_values[i * stride + k] = x_values[j * stride + k]; - LoadT vec_x; - phi::Load(x_values + j * stride + k * VecSize, &vec_x); - phi::Store(vec_x, out_values + i * stride + k * VecSize); - } - } - } -} - template __global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { @@ -214,11 +187,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 3. flatten x indices and mask indices @@ -252,7 +221,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, table_size *= x_dims[i]; } DenseTensor table = phi::Empty(dev_ctx, {table_size}); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); const int64_t stride = x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; @@ -308,7 +277,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_mask, +PD_REGISTER_KERNEL(mask, GPU, ALL_LAYOUT, phi::sparse::SparseMaskKernel, @@ -323,7 +292,7 @@ PD_REGISTER_KERNEL(sparse_mask, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -PD_REGISTER_KERNEL(sparse_mask_helper, +PD_REGISTER_KERNEL(mask_helper, GPU, ALL_LAYOUT, phi::sparse::SparseMaskHelperKernel, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index df4fec61a9a3d..48cdae5aa0868 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -121,7 +121,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0", + "Conv3d", &rulebook, &counter); @@ -152,7 +152,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d"); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -209,7 +209,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0", + "Conv3d", &d_rulebook, &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); @@ -258,7 +258,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d"); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = From 4346bbb188371be8b2088f6ba7b672796b3f8ce3 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 08:03:26 +0000 Subject: [PATCH 61/70] fix comment --- paddle/phi/kernels/funcs/sparse/convolution.h | 29 +++++++++++++++ .../sparse/cpu/convolution_grad_kernel.cc | 21 ++--------- .../sparse/gpu/convolution_grad_kernel.cu | 35 ++++--------------- .../kernels/sparse/gpu/convolution_kernel.cu | 4 +-- 4 files changed, 40 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index f3caa2a62f4a8..99aeb8eaf6098 
100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -188,6 +188,35 @@ inline void PrefixSum(const T* counter, T* offsets, const int n) { offsets[n] = offset; } +template +inline const IntT* GetRulebookPtr(const SparseCooTensor& coo, + const DenseTensor& rulebook, + const std::string& key, + int* rulebook_len) { + if (!key.empty()) { + const auto* table = coo.table(key); + if (table != nullptr) { + const DenseTensor& tmp_rulebook = table->first; + *rulebook_len = tmp_rulebook.dims()[1]; + return tmp_rulebook.data(); + } + } + *rulebook_len = rulebook.dims()[1]; + return rulebook.data(); +} + +inline const int* GetCounterPtr(const SparseCooTensor& coo, + const DenseTensor& counter, + const std::string& key) { + if (!key.empty()) { + const auto* table = coo.table(key); + if (table != nullptr) { + return table->second.data(); + } + } + return counter.data(); +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5c6c2539c0a74..5e51a56e53cb7 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -52,24 +52,9 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int out_channels = kernel_dims[4]; int rulebook_len = 0; - const IntT* rulebook_ptr = nullptr; - const int* counter_ptr = nullptr; - bool cache_in_table = false; - if (!key.empty()) { - const auto* table = out.table(key); - if (table != nullptr) { - cache_in_table = true; - const DenseTensor& tmp_rulebook = table->first; - rulebook_ptr = tmp_rulebook.data(); - rulebook_len = tmp_rulebook.dims()[1]; - counter_ptr = table->second.data(); - } - } - if (!cache_in_table) { - rulebook_ptr = rulebook.data(); - rulebook_len = rulebook.dims()[1]; - counter_ptr = counter.data(); - } + const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr( + out, rulebook, key, &rulebook_len); + const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key); DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 1a6842416e3dd..08e3d71c961ac 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -59,44 +58,22 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const int out_channels = kernel_dims[4]; int rulebook_len = 0; - const IntT* rulebook_ptr = nullptr; - const int* counter_ptr = nullptr; - bool cache_in_table = false; - if (!key.empty()) { - const auto* table = out.table(key); - if (table != nullptr) { - cache_in_table = true; - const DenseTensor& tmp_rulebook = table->first; - rulebook_ptr = tmp_rulebook.data(); - rulebook_len = tmp_rulebook.dims()[1]; - counter_ptr = table->second.data(); - } - } - if (!cache_in_table) { - rulebook_ptr = rulebook.data(); - rulebook_len = rulebook.dims()[1]; - counter_ptr = counter.data(); - } + const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr( + out, rulebook, key, &rulebook_len); + const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key); - DenseTensorMeta in_features_meta( - x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); - DenseTensorMeta d_x_features_meta( - x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_grad_features_meta( - x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor d_x_features = - phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_grad_features = - phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* d_x_features_ptr = d_x_features.data(); T* out_grad_features_ptr = out_grad_features.data(); *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); - phi::funcs::SetConstant set_zero; phi::backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 51e0dfcc40348..4afa197eb4cca 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -72,8 +72,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 1. 
product rulebook DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); - DenseTensor out_index; - DenseTensor unique_value; + DenseTensor out_index = phi::Empty(dev_ctx, {1}); + DenseTensor unique_value = phi::Empty(dev_ctx, {1}); VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; int rulebook_len = 0; From 3187f52b9b4a29b12873ac0235bf28745811b38d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 09:10:42 +0000 Subject: [PATCH 62/70] opt code structure --- paddle/phi/kernels/funcs/sparse/convolution.h | 54 +++++++++++++++++++ .../kernels/sparse/cpu/convolution_kernel.cc | 53 +++++++----------- .../kernels/sparse/gpu/convolution_kernel.cu | 48 +++++------------ 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index 99aeb8eaf6098..a4027670a508c 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { @@ -217,6 +218,59 @@ inline const int* GetCounterPtr(const SparseCooTensor& coo, return counter.data(); } +template +inline const IntT* PrepareSubm(const Context& dev_ctx, + const SparseCooTensor& x, + const std::string& key, + const DDim& out_dims, + SparseCooTensor* out, + std::vector* counter, + std::vector* offsets, + int* rulebook_len, + bool* need_product_rulebook) { + const auto* table = x.table(key); + if (table != nullptr) { + *need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + memcpy(counter->data(), + table->second.data(), + table->second.size() * sizeof(int)); + out->SetTablePtr(x.GetTablePtr()); + + *rulebook_len = rulebook.dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, false); + PrefixSum(counter->data(), offsets->data(), counter->size()); + return rulebook.data(); + } + return nullptr; +} + +template +inline void SaveToTable(const Context& dev_ctx, + const SparseCooTensor& x, + const std::string& key, + const DenseTensor& in_rulebook, + const std::vector& counter_vec, + SparseCooTensor* out, + DenseTensor* out_rulebook, + DenseTensor* counter) { + out->SetTablePtr(x.GetTablePtr()); + if (!key.empty()) { + out->SetTable(key, std::make_pair(in_rulebook, counter_vec)); + } else { + *out_rulebook = in_rulebook; + counter->Resize({static_cast(counter_vec.size())}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_vec.data(), counter_vec.size() * sizeof(int)); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index f5f7497df96fb..42450427bc0a9 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -70,34 +70,23 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. 
product rulebook std::vector counter_per_kernel(kernel_size, 0); + std::vector offsets(kernel_size + 1); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; int n = 0; bool need_product_rulebook = true; if (subm && !key.empty()) { - const auto* table = x.table(key); - if (table != nullptr) { - need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - memcpy(counter_per_kernel.data(), - table->second.data(), - kernel_size * sizeof(int)); - } + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + &counter_per_kernel, + &offsets, + &n, + &need_product_rulebook); } if (need_product_rulebook) { DenseTensor tmp_rulebook; @@ -117,17 +106,14 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, n = tmp_rulebook.dims()[1]; rulebook_ptr = tmp_rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - if (!key.empty()) { - out->SetTable(key, std::make_pair(tmp_rulebook, counter_per_kernel)); - } else { - *rulebook = tmp_rulebook; - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, - counter_per_kernel.data(), - counter_per_kernel.size() * sizeof(int)); - } + phi::funcs::sparse::SaveToTable(dev_ctx, + x, + key, + tmp_rulebook, + counter_per_kernel, + out, + rulebook, + counter); } // int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -152,7 +138,6 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // 3. 
call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); - std::vector offsets(kernel_size + 1); int offset = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 4afa197eb4cca..05aba7521eca5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -80,33 +80,16 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const IntT* rulebook_ptr = nullptr; bool need_product_rulebook = true; if (subm && !key.empty()) { - const auto* table = x.table(key); - if (table != nullptr) { - need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); - - rulebook_len = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - IntT offset = 0; - for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += h_counter[i]; - } - offsets[kernel_size] = offset; - } + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + &h_counter, + &offsets, + &rulebook_len, + &need_product_rulebook); } if (need_product_rulebook) { @@ -129,15 +112,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, &offsets); rulebook_ptr = tmp_rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - if (!key.empty()) { - out->SetTable(key, std::make_pair(tmp_rulebook, h_counter)); - } else { - *rulebook = tmp_rulebook; - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); - } + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); } // 2. 
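Taken together, PrepareSubm and SaveToTable implement the submanifold-convolution cache: the first SubmConv3D that runs under a given key builds the rulebook and stores it on the output tensor, and any later layer passing the same key reuses it instead of recomputing. A control-flow sketch of the forward kernels refactored in this commit, with template parameters written explicitly as an assumption (the hunks drop them) and ProductRuleBook elided:

// Sketch only; mirrors the CPU/GPU forward kernels above, not verbatim code.
bool need_product_rulebook = true;
const IntT* rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT>(
    dev_ctx, x, key, out_dims, out, &h_counter, &offsets, &rulebook_len,
    &need_product_rulebook);           // cache hit: reuse the rulebook under key
if (need_product_rulebook) {           // cache miss: build it once and store it
  DenseTensor tmp_rulebook;
  // ... ProductRuleBook(...) fills tmp_rulebook and h_counter ...
  rulebook_ptr = tmp_rulebook.data<IntT>();
  phi::funcs::sparse::SaveToTable(
      dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}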
gather From aa284f4a1b43280c91c8952797536720387f293c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 12:20:37 +0000 Subject: [PATCH 63/70] rename conv_kernel --- paddle/phi/api/yaml/sparse_api.yaml | 6 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +- paddle/phi/kernels/sparse/conv_grad_kernel.h | 79 ++++++++++++++ paddle/phi/kernels/sparse/conv_kernel.h | 68 ++++++++++++ .../kernels/sparse/convolution_grad_kernel.h | 80 -------------- .../phi/kernels/sparse/convolution_kernel.h | 68 ------------ .../kernels/sparse/cpu/coalesced_kernel.cc | 2 +- .../sparse/cpu/{convolution.h => conv.h} | 2 +- ...ion_grad_kernel.cc => conv_grad_kernel.cc} | 100 +++++++++--------- .../{convolution_kernel.cc => conv_kernel.cc} | 78 +++++++------- paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 2 +- .../gpu/{convolution.cu.h => conv.cu.h} | 80 +++++++------- ...ion_grad_kernel.cu => conv_grad_kernel.cu} | 100 +++++++++--------- .../{convolution_kernel.cu => conv_kernel.cu} | 82 +++++++------- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 2 +- paddle/phi/tests/api/test_sparse_conv_api.cc | 6 +- .../kernels/test_sparse_conv3d_dev_api.cc | 100 +++++++++--------- .../tests/unittests/test_sparse_utils_op.py | 2 + python/paddle/incubate/sparse/coalesced.py | 27 +++++ .../incubate/sparse/nn/functional/conv.py | 6 +- 20 files changed, 463 insertions(+), 433 deletions(-) create mode 100644 paddle/phi/kernels/sparse/conv_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/conv_kernel.h delete mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h delete mode 100644 paddle/phi/kernels/sparse/convolution_kernel.h rename paddle/phi/kernels/sparse/cpu/{convolution.h => conv.h} (99%) rename paddle/phi/kernels/sparse/cpu/{convolution_grad_kernel.cc => conv_grad_kernel.cc} (77%) rename paddle/phi/kernels/sparse/cpu/{convolution_kernel.cc => conv_kernel.cc} (82%) rename paddle/phi/kernels/sparse/gpu/{convolution.cu.h => conv.cu.h} (94%) rename paddle/phi/kernels/sparse/gpu/{convolution_grad_kernel.cu => conv_grad_kernel.cu} (81%) rename paddle/phi/kernels/sparse/gpu/{convolution_kernel.cu => conv_kernel.cu} (85%) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index a73529dde3c17..5780bec804008 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -7,14 +7,14 @@ layout : x backward : add_grad -- api : conv3d +- api : conv3d_coo args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense} + func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense, dense} layout : x intermediate: rulebook, counter - backward : conv3d_grad + backward : conv3d_coo_grad - api : coo_to_dense args : (Tensor x) diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 4d0371257d810..56f0595351d35 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -6,12 +6,12 @@ func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} -- backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) +- 
backward_api : conv3d_coo_grad + forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + func : conv3d_coo_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/conv_grad_kernel.h b/paddle/phi/kernels/sparse/conv_grad_kernel.h new file mode 100644 index 0000000000000..867f6b5a53f37 --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::tuple Conv3dCooGrad( + const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key) { + SparseCooTensor x_grad; + DenseTensor kernel_grad; + + // TODO(zhangkaihuo): call InferMeta func here + Conv3dCooGradKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + &x_grad, + &kernel_grad); + return std::make_tuple(x_grad, kernel_grad); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/conv_kernel.h b/paddle/phi/kernels/sparse/conv_kernel.h new file mode 100644 index 0000000000000..0c5a2081a6f3d --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_kernel.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); + +template +SparseCooTensor Conv3dCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + DenseTensor* rulebook, + DenseTensor* counter) { + SparseCooTensor coo; + Conv3dCooKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + &coo, + rulebook, + counter); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h deleted file mode 100644 index 54d09babb2cf9..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" - -namespace phi { -namespace sparse { - -template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad); - -template -std::tuple Conv3dGrad( - const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key) { - SparseCooTensor x_grad; - DenseTensor kernel_grad; - - // TODO(zhangkaihuo): call InferMeta func here - Conv3dGradKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - &x_grad, - &kernel_grad); - return std::make_tuple(x_grad, kernel_grad); -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h deleted file mode 100644 index 62559d4e0ff1e..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter); - -template -SparseCooTensor Conv3d(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - DenseTensor* rulebook, - DenseTensor* counter) { - SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - &coo, - rulebook, - counter); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc index 9d1f71afceb5e..b42294cfc0315 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -107,7 +107,7 @@ void CoalescedKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesced, CPU, ALL_LAYOUT, phi::sparse::CoalescedKernel, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/conv.h similarity index 99% rename from paddle/phi/kernels/sparse/cpu/convolution.h rename to paddle/phi/kernels/sparse/cpu/conv.h index 07baf77ff5d27..82480e492abae 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/conv.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc similarity index 77% rename from paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index 5e51a56e53cb7..44ad2fa588b55 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { @@ -31,21 +31,21 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -181,48 +181,48 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { - Conv3dGradCPUKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradCPUKernel", ([&] { + Conv3dCooGradCPUKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, CPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc similarity index 82% rename from 
paddle/phi/kernels/sparse/cpu/convolution_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 42450427bc0a9..5a892e64c65a0 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { @@ -28,18 +28,18 @@ namespace sparse { * out: (N, D, H, W, OC) **/ template -void Conv3dCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -178,32 +178,32 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { - Conv3dCPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - out, - rulebook, - counter); + x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] { + Conv3dCooCPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + out, + rulebook, + counter); })); } @@ -211,6 +211,6 @@ void Conv3dKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - sparse_conv3d, CPU, ALL_LAYOUT, phi::sparse::Conv3dKernel, float, double) { + conv3d_coo, CPU, ALL_LAYOUT, phi::sparse::Conv3dCooKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc index 38e512bd00c93..36949f7161245 100644 --- a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h similarity index 94% rename from paddle/phi/kernels/sparse/gpu/convolution.cu.h rename to paddle/phi/kernels/sparse/gpu/conv.cu.h index 4363f94f69443..8cd55e50d98d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -14,11 +14,9 @@ limitations under the License. */ #pragma once -#include -#include #include -#include #include +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -31,7 +29,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { @@ -490,6 +487,34 @@ __global__ void GroupIndexsV2(const int rulebook_len, } } +inline void CallThrustScan(const GPUContext& dev_ctx, + const int* counter_ptr, + const int kernel_size, + int* offsets_ptr, + int* h_counter_ptr, + int* h_offsets_ptr) { +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + + phi::backends::gpu::GpuMemcpyAsync(h_counter_ptr, + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + + phi::backends::gpu::GpuMemcpyAsync(h_offsets_ptr, + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -608,22 +633,13 @@ int ProductRuleBook(const Context& dev_ctx, out->SetMember(out_indices, out_values, out_dims, false); - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + CallThrustScan(dev_ctx, + counter_ptr, + kernel_size, + offsets_ptr, + h_counter->data(), + h_offsets->data()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); dev_ctx.Wait(); int rulebook_len = (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; @@ -675,26 +691,12 @@ int ProductRuleBook(const Context& dev_ctx, IntT rulebook_len = (last - rulebook_ptr) / 2; -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + CallThrustScan(dev_ctx, + counter_ptr, + kernel_size, + 
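CallThrustScan wraps the exclusive scan and the two device-to-host copies that both branches of ProductRuleBook need. A small worked example of what the scan produces (the counter values are made up for illustration):

// counter      = {3, 0, 5, 2}   // rulebook entries generated per kernel offset
// offsets      = {0, 3, 3, 8}   // exclusive prefix sum: start of each segment
// rulebook_len = offsets[kernel_size - 1] + counter[kernel_size - 1]
//              = 8 + 2 = 10     // total number of rulebook entries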
offsets_ptr, + h_counter->data(), + h_offsets->data()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu similarity index 81% rename from paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 08e3d71c961ac..848517aae2549 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" namespace phi { namespace sparse { @@ -37,21 +37,21 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -218,48 +218,48 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), 
"Conv3dGradGPUKernel", ([&] { - Conv3dGradGPUKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradGPUKernel", ([&] { + Conv3dCooGradGPUKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, GPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu similarity index 85% rename from paddle/phi/kernels/sparse/gpu/convolution_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 05aba7521eca5..89159f4c55a70 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" #include "glog/logging.h" @@ -29,18 +29,18 @@ namespace phi { namespace sparse { template -void Conv3dGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -205,42 +205,42 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, * counter: return counter if key is not vailed else return nullptr **/ template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { 
PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { - Conv3dGPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - out, - rulebook, - counter); + x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { + Conv3dCooGPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + out, + rulebook, + counter); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d, +PD_REGISTER_KERNEL(conv3d_coo, GPU, ALL_LAYOUT, - phi::sparse::Conv3dKernel, + phi::sparse::Conv3dCooKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 255c6621da015..7ac727cae4ca9 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" namespace phi { namespace sparse { diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index f8ca0f6651c9d..b1df197f42f47 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" -PD_DECLARE_KERNEL(sparse_conv3d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv3d_coo, CPU, ALL_LAYOUT); template void TestConv3dBase(const std::vector& indices, @@ -76,8 +76,8 @@ void TestConv3dBase(const std::vector& indices, kernel.size() * sizeof(T)); if (!std::is_same::value) { - auto tensor_out = paddle::experimental::sparse::conv3d( - x, weight, paddings, dilations, strides, 1, false, "Conv3d_0"); + auto tensor_out = paddle::experimental::sparse::conv3d_coo( + x, weight, paddings, dilations, strides, 1, false, "Conv3d"); auto out = std::dynamic_pointer_cast(tensor_out.impl()); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 48cdae5aa0868..df0e87c6b5a49 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -23,8 +23,8 @@ limitations under the License. 
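The comment above Conv3dCooKernel ("return rulebook if key is not vailed") reads more clearly as: when key is empty, the rulebook and counter come back through the rulebook/counter outputs; when a key is given, they are cached on out under that key (see SaveToTable) and the outputs need not be consumed. A caller-side sketch, mirroring the dev-API test in this patch; the template argument is an assumption, since the hunks drop it:

// Caller-side sketch; "Conv3d" is the cache key the tests in this patch use.
DenseTensor rulebook, counter;
SparseCooTensor out = sparse::Conv3dCoo<T>(dev_ctx, x, kernel, paddings,
                                           dilations, strides, /*groups=*/1,
                                           subm, "Conv3d", &rulebook, &counter);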
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/sparse/coalesced_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace tests { @@ -113,17 +113,17 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { DenseTensor rulebook, counter; - SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, - x_tensor, - kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d", - &rulebook, - &counter); + SparseCooTensor out = sparse::Conv3dCoo(dev_ctx_cpu, + x_tensor, + kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d", + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -140,19 +140,19 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - kernel_tensor, - out, - rulebook, - counter, - out, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d"); + sparse::Conv3dCooGrad(dev_ctx_cpu, + x_tensor, + kernel_tensor, + out, + rulebook, + counter, + out, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d"); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -201,17 +201,17 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); DenseTensor d_rulebook, d_counter; - SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d", - &d_rulebook, - &d_counter); + SparseCooTensor d_out = sparse::Conv3dCoo(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d", + &d_rulebook, + &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -246,19 +246,19 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - d_out, - d_rulebook, - d_counter, - d_out, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d"); + sparse::Conv3dCooGrad(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + d_out, + d_rulebook, + d_counter, + d_out, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d"); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index a12425b69299e..b71ef0357cb37 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -298,6 +298,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesced(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -310,6 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, 
dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesced(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py index dcd2f8ca28f3a..23c82499851d3 100644 --- a/python/paddle/incubate/sparse/coalesced.py +++ b/python/paddle/incubate/sparse/coalesced.py @@ -22,4 +22,31 @@ @dygraph_only def coalesced(x): + r""" + the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . + + Args: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: return the SparseCooTensor after coalesced. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate import sparse + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesced(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ return _C_ops.final_state_sparse_coalesced(x) diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index 2dda83b2c1659..503ad9a127b0f 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -63,9 +63,9 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, - stride, groups, subm, - key if key is not None else "") + pre_bias = _C_ops.final_state_sparse_conv3d_coo( + x, weight, padding, dilation, stride, groups, subm, + key if key is not None else "") if bias is not None: values = pre_bias.values() add_bias = elementwise_add(values, bias, axis=1) From 0b5ca0eeaa907535f26d18a80a5f6f68734b148d Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Fri, 8 Jul 2022 12:40:52 +0800 Subject: [PATCH 64/70] fix --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 1936fccc63e4a..61694db7e8ed3 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -236,8 +236,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - merge_block_vertical( - x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.y > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -272,12 +276,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - merge_block_vertical(x_sum, - x_square_sum, - smem_sum, - smem_square_sum, - &x_sum, - &x_square_sum); + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); // final compute if (threadIdx.y == 0) { @@ -400,8 +404,12 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - merge_block_horizonal( - x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); + merge_block_horizonal(x_sum, + x_square_sum, + 
&smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.x > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -436,12 +444,12 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - merge_block_horizonal(x_sum, - x_square_sum, - smem_sum, - smem_square_sum, - &x_sum, - &x_square_sum); + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); // final compute if (threadIdx.x == 0) { From 33ebaf5d8c35cbd0613477e0d37c84f82c7fd13f Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 8 Jul 2022 08:21:24 +0000 Subject: [PATCH 65/70] rename table_ptr to indices_dict --- paddle/phi/api/yaml/sparse_api.yaml | 4 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 2 +- paddle/phi/core/sparse_coo_tensor.h | 50 +++++++++-------- paddle/phi/kernels/funcs/sparse/convolution.h | 42 +++++++------- .../{coalesced_kernel.h => coalesce_kernel.h} | 10 ++-- ...coalesced_kernel.cc => coalesce_kernel.cc} | 22 ++++---- paddle/phi/kernels/sparse/cpu/conv.h | 6 +- paddle/phi/kernels/sparse/cpu/conv_kernel.cc | 38 ++++++------- paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 6 +- .../gpu/.convolution_grad_kernel.cu.swp | Bin 20480 -> 0 bytes ...coalesced_kernel.cu => coalesce_kernel.cu} | 22 ++++---- paddle/phi/kernels/sparse/gpu/conv.cu.h | 23 +++----- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 22 +++++--- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 4 +- .../phi/kernels/sparse/sparse_utils_kernel.h | 7 +-- .../kernels/test_sparse_conv3d_dev_api.cc | 4 +- .../tests/kernels/test_sparse_pool_dev_api.cc | 4 +- .../tests/unittests/test_sparse_conv_op.py | 2 +- python/paddle/incubate/sparse/__init__.py | 5 +- python/paddle/incubate/sparse/coalesced.py | 52 ------------------ python/paddle/incubate/sparse/unary.py | 32 +++++++++++ 21 files changed, 163 insertions(+), 194 deletions(-) rename paddle/phi/kernels/sparse/{coalesced_kernel.h => coalesce_kernel.h} (78%) rename paddle/phi/kernels/sparse/cpu/{coalesced_kernel.cc => coalesce_kernel.cc} (87%) delete mode 100644 paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp rename paddle/phi/kernels/sparse/gpu/{coalesced_kernel.cu => coalesce_kernel.cu} (92%) delete mode 100644 python/paddle/incubate/sparse/coalesced.py diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 5780bec804008..917137e2343d9 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -132,11 +132,11 @@ layout : x backward : values_grad -- api: coalesced +- api: coalesce args : (Tensor x) output : Tensor(out) kernel : - func: coalesced{sparse_coo -> sparse_coo} + func: coalesce{sparse_coo -> sparse_coo} layout : x - api: full_like diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 56f0595351d35..b4d990e3ae5e0 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -7,7 +7,7 @@ add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - backward_api : conv3d_coo_grad - forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) + forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter) args : (Tensor x, Tensor kernel, Tensor 
out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index c69c7aab89d28..300ae8a0ab958 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -157,37 +157,45 @@ class SparseCooTensor : public TensorBase, int32_t dense_dim() const; /// \brief query table according to key - const std::pair>* table( + const std::pair* IndicesPairs( const std::string& key) const { - const auto& iter = table_ptr_->find(key); - if (iter == table_ptr_->end()) { + if (indices_dict_ == nullptr) { + return nullptr; + } + const auto& iter = indices_dict_->find(key); + if (iter == indices_dict_->end()) { return nullptr; } return &iter->second; } - /// \brief set table according to key - void SetTable(const std::string& key, - const std::pair>& table) { - auto ret = table_ptr_->insert({key, table}); + /// \brief save (key, indices_pairs) + void SaveIndicesPairs( + const std::string& key, + const std::pair& indices_pairs) { + if (indices_dict_ == nullptr) { + indices_dict_ = std::make_shared< + std::map>>(); + } + auto ret = indices_dict_->insert({key, indices_pairs}); if (ret.second == false) { - ret.first->second = table; + ret.first->second = indices_pairs; } } - /// \brief get table_ptr_ + /// \brief get indices_dict_ const std::shared_ptr< - std::map>>>& - GetTablePtr() const { - return table_ptr_; + std::map>>& + GetIndicesDict() const { + return indices_dict_; } - /// \brief set table_ptr_ - void SetTablePtr( + /// \brief set indices_dict_ + void SetIndicesDict( const std::shared_ptr< - std::map>>>& - table_ptr) { - table_ptr_ = table_ptr; + std::map>>& + indices_dict) { + indices_dict_ = indices_dict; } private: @@ -203,11 +211,9 @@ class SparseCooTensor : public TensorBase, // for submanifold conv // SubmConv will generate a rulebook and a counter, which can be // reused by different SubmConv. - // refer to sparse/gpu/convolution_kernel.cu - std::shared_ptr< - std::map>>> - table_ptr_ = std::make_shared< - std::map>>>(); + // refer to sparse/gpu/convolution_kernel.cu. 
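The renamed members make the cache container explicit: indices_dict_ maps a caller-supplied key to a (rulebook, counter) pair, SaveIndicesPairs stores an entry, and IndicesPairs returns it, or nullptr on a miss. A minimal producer/consumer sketch using only the members declared above; the key string is an arbitrary example, and the pair element types are assumed to be DenseTensor because the plain-text hunks drop template arguments:

// Illustrative only; the real call sites are the conv kernels referenced above.
std::pair<DenseTensor, DenseTensor> pairs{rulebook, counter};
out.SaveIndicesPairs("subm_conv1", pairs);           // first SubmConv3D: store
const auto* hit = out.IndicesPairs("subm_conv1");    // later SubmConv3D: look up
if (hit != nullptr) {
  const DenseTensor& cached_rulebook = hit->first;   // reuse, skip rulebook build
  const DenseTensor& cached_counter = hit->second;
}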
+ std::shared_ptr>> + indices_dict_ = nullptr; /* --------------------------- */ /* example: non zero element is scalar */ diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index a4027670a508c..0c6b8b76b54d8 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -195,9 +195,9 @@ inline const IntT* GetRulebookPtr(const SparseCooTensor& coo, const std::string& key, int* rulebook_len) { if (!key.empty()) { - const auto* table = coo.table(key); - if (table != nullptr) { - const DenseTensor& tmp_rulebook = table->first; + const auto* indices_pairs = coo.IndicesPairs(key); + if (indices_pairs != nullptr) { + const DenseTensor& tmp_rulebook = indices_pairs->first; *rulebook_len = tmp_rulebook.dims()[1]; return tmp_rulebook.data(); } @@ -210,9 +210,9 @@ inline const int* GetCounterPtr(const SparseCooTensor& coo, const DenseTensor& counter, const std::string& key) { if (!key.empty()) { - const auto* table = coo.table(key); - if (table != nullptr) { - return table->second.data(); + const auto* indices_pairs = coo.IndicesPairs(key); + if (indices_pairs != nullptr) { + return indices_pairs->second.data(); } } return counter.data(); @@ -224,18 +224,18 @@ inline const IntT* PrepareSubm(const Context& dev_ctx, const std::string& key, const DDim& out_dims, SparseCooTensor* out, - std::vector* counter, - std::vector* offsets, + int* counter, + int* offsets, int* rulebook_len, bool* need_product_rulebook) { - const auto* table = x.table(key); - if (table != nullptr) { + const auto* indices_pairs = x.IndicesPairs(key); + if (indices_pairs != nullptr) { *need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - memcpy(counter->data(), - table->second.data(), - table->second.size() * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); + const DenseTensor& rulebook = indices_pairs->first; + const int counter_size = indices_pairs->second.numel(); + memcpy( + counter, indices_pairs->second.data(), counter_size * sizeof(int)); + out->SetIndicesDict(x.GetIndicesDict()); *rulebook_len = rulebook.dims()[1]; @@ -245,7 +245,7 @@ inline const IntT* PrepareSubm(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, false); - PrefixSum(counter->data(), offsets->data(), counter->size()); + PrefixSum(counter, offsets, counter_size); return rulebook.data(); } return nullptr; @@ -256,18 +256,18 @@ inline void SaveToTable(const Context& dev_ctx, const SparseCooTensor& x, const std::string& key, const DenseTensor& in_rulebook, - const std::vector& counter_vec, + const DenseTensor& h_counter, SparseCooTensor* out, DenseTensor* out_rulebook, DenseTensor* counter) { - out->SetTablePtr(x.GetTablePtr()); + out->SetIndicesDict(x.GetIndicesDict()); if (!key.empty()) { - out->SetTable(key, std::make_pair(in_rulebook, counter_vec)); + out->SaveIndicesPairs(key, std::make_pair(in_rulebook, h_counter)); } else { *out_rulebook = in_rulebook; - counter->Resize({static_cast(counter_vec.size())}); + counter->Resize({h_counter.numel()}); int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, counter_vec.data(), counter_vec.size() * sizeof(int)); + memcpy(counter_ptr, h_counter.data(), h_counter.numel() * sizeof(int)); } } diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesce_kernel.h similarity index 78% rename from 
paddle/phi/kernels/sparse/coalesced_kernel.h rename to paddle/phi/kernels/sparse/coalesce_kernel.h index d2f5f8f3150af..cb8b98fd87404 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/sparse/coalesce_kernel.h @@ -22,14 +22,14 @@ namespace phi { namespace sparse { template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); template -SparseCooTensor Coalesced(const Context& dev_ctx, const SparseCooTensor& x) { +SparseCooTensor Coalesce(const Context& dev_ctx, const SparseCooTensor& x) { SparseCooTensor coo; - CoalescedKernel(dev_ctx, x, &coo); + CoalesceKernel(dev_ctx, x, &coo); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc similarity index 87% rename from paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc rename to paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc index b42294cfc0315..95d8abd6bcf5c 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" @@ -22,9 +22,9 @@ namespace phi { namespace sparse { template -void CoalescedCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -95,22 +95,22 @@ void CoalescedCPUKernel(const CPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { - CoalescedCPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceCPUKernel", ([&] { + CoalesceCPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(coalesced, +PD_REGISTER_KERNEL(coalesce, CPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/cpu/conv.h b/paddle/phi/kernels/sparse/cpu/conv.h index 82480e492abae..e47f33c8c4834 100644 --- a/paddle/phi/kernels/sparse/cpu/conv.h +++ b/paddle/phi/kernels/sparse/cpu/conv.h @@ -41,12 +41,12 @@ void ProductRuleBook(const Context& dev_ctx, const DDim& out_dims, const bool subm, DenseTensor* rulebook, - std::vector* counter_per_kernel) { + int* counter_per_kernel) { const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - memset(counter_per_kernel->data(), 0, kernel_size * sizeof(int)); + memset(counter_per_kernel, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len @@ -106,7 
+106,7 @@ void ProductRuleBook(const Context& dev_ctx, } if (rulebook_ptr == nullptr) { - (*counter_per_kernel)[kernel_index - 1] += 1; + counter_per_kernel[kernel_index - 1] += 1; ++rulebook_len; } else { rulebook_ptr[rulebook_index] = kernel_index - 1; diff --git a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 5a892e64c65a0..f15a636f96d45 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -69,8 +69,11 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook - std::vector counter_per_kernel(kernel_size, 0); - std::vector offsets(kernel_size + 1); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; @@ -83,8 +86,8 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, key, out_dims, out, - &counter_per_kernel, - &offsets, + h_counter_ptr, + h_offsets_ptr, &n, &need_product_rulebook); } @@ -99,24 +102,17 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, out_dims, subm, &tmp_rulebook, - &counter_per_kernel); + h_counter_ptr); UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out); n = tmp_rulebook.dims()[1]; rulebook_ptr = tmp_rulebook.data(); - phi::funcs::sparse::SaveToTable(dev_ctx, - x, - key, - tmp_rulebook, - counter_per_kernel, - out, - rulebook, - counter); + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); } // int n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); // 2. 
gather DenseTensorMeta in_features_meta( @@ -140,24 +136,24 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, auto blas = phi::funcs::GetBlas(dev_ctx); int offset = 0; for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += counter_ptr[i]; + h_offsets_ptr[i] = offset; + offset += h_counter_ptr[i]; } - offsets[kernel_size] = offset; + h_offsets_ptr[kernel_size] = offset; const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter_ptr[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = counter_ptr[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; // in_channels const int N = out_channels; // out_channels - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, M, diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc index 36949f7161245..a8d41d0578b87 100644 --- a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -48,10 +48,6 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - // DenseTensorMeta counter_meta( - // DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, - // std::move(counter_meta)); std::vector counter_per_kernel(kernel_size, 0); const T* in_features_ptr = x.non_zero_elements().data(); @@ -65,7 +61,7 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, out_dims, false, rulebook, - &counter_per_kernel); + counter_per_kernel.data()); UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); diff --git a/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp b/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp deleted file mode 100644 index e1d0e2bee6c88631e06944143561a3ec7c50b945..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeHNZEPGz86GGlgtQPSsgMw2aFDrk?#p)6L|4afu}yl4V+Y?QQLxh0-tOGp;=SF? 
z?yT)gNGl;xBt#I3gvzh-RfJR!@PpswLn@+D1!`1*A|WIY^EhE2nGRUt_mP+}noxDCZHJvY&N+$|JgcRFuhZV22b*HtaTkX&-dcON< zX7J*9-85}h^x|9FDy}i#VRldqJlzjivE_wD!*$kOJLHz@Xf0niwMeF5gnJcGy;ls} z>kNdvnLqg;nVy;~2`Y|`eV9J<-V66SE2{pA0mXn~Krx^gPz)#r6a$KZdx`<>-AA58 zjo%vA`f_}K%Z~f&Z$_6Hr+;e4`?tl<$Kw0*JKo2S>ZTY_3@8Q^1BwB~fMP%~pcqgL zC@-yHT@Hp@x;MGIW2VMbw3j754Ch!dK1>gzbD$oJufe!$G zegYi+jkK19PlJC4IBi1hYEfV_%84n;5u*#co_KG z+X?w0@Dwl)90gvxpO6=UF9Q~E4444^g-wo^fLnk8oB$30zq*f*UjokpHZTd?58TG4 z%2$D>flmTO;Gf|67l7dVDZzPk(NQixv{}P-*EE|sbbzjX7peE+fdITt4H4=UB(^x|N{o7@6_5X1JllnZH`Pl4R3M%k(UwNI6={ zJ_@S=H_PR9gmt+73<@_(-DWdjJjg`x8k#6KJw_+9IU1WzzY$h)M3`Y*yT*JsW3qM4 z;F~!*Zfcualj%J4S&&0MI=2kX&;veGpUspCrDirNK%eHO3>eu(r9MF;8Esv+Ll$UY zh@Ohm^0TJSb&-cO9|`7XSD2_t#W5?k?HW2~nazTO*c4G{joGZj98@}6Fs%-5QdJMR zOB=Qxz_uj689D}-Kg0NGq|VH_V0x41DDsM{E%iVnZR>V34ta?vJVn!l7U|TZgK3MU zM99o845W)wV;m&`zYb3qsn;Z;mBCmNffJLJ%4Y{-+urok7QYre$=brS8}cZ|Cu#0D zkG}N6wx*GV+Ug^*Qo7sp$y-jZOU2@;l(aPJEH25@LdWCXq=f{j=N230bMt8bRL6gQSTzPTwa=c?2>Hr6fft%SU)$iIGLEA;d!BC`^_p%(|<3Q6lHNH zs0%{c0#3!~PBVtdA$g1^kJZU5a&Y(LCwdZbr`=KzQU6I6Un~NzlnD{#N06++XcQ`oe!rE3PY#~RQOhyx&}4LMMcfFSg`mGCIad)zheb~w5mHMxm2w`wtS&ZFI84nDoge1 zLX9r3(7ENM`D(qoyo9Gu(8|(fdbYYWpQFrzeV?*T56kj^B0mcQ%wlGNn1WBM5prhs z7nfQAO$)dV)g3xksnKd}jGnI4s)je1@^oGq)mkp7h-trmV zVQp<+vFR%&%~;j(l+N_xmeuT*eK{2E^StD{ZNYTHgzUs|LH zS+vNASKlF=Y>g3>Q`^=9D>&FpvbiQU)b^oNyrk~i@i%ylIJ^Fp< zhmLhMWYGcT2qxnF&H{E|nBK``XJ>Jym^wfe2M0On|NjPOC0T=i<@G;;$!0+MvzX*H@I13yC?%-Ts zocr5A2_V25IM;t2_!Dp&co}#OxCI121^6iN0PqKt_ZxsH_cNmG>QW3S1{4E|0mXn~ zKr!(D!N4GAW}m*ufR(87Ok@zjzC05V0J0xvq|cMt$4Xy0GU-k`nj@3e=>0DZAw52@ z=WVz~riSBht$`5EwE|*ED6Juu6Y;KRWfTdbK4)fTVtFYf#aoEj5z-JBR}x#YEjj}s zAnArgz-t_3g;)VZipW8jL=+^!w~QnHc2PzZz@>-ajn? z&_U^V&p90`GLoO%6B6;W$F|WSsdasX0w84eg7jsp!W-^|QHpw|B8U($u;uS2d<_k} zlo2yB)FTSE9AF6bzbi96itfgW3~t;iY)0%!)e(kNE?1Z8(^ETQCx=B#Mw82b6oQOh No8tu -void CoalescedGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -172,21 +172,21 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { - CoalescedGPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceGPUKernel", ([&] { + CoalesceGPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(coalesced, +PD_REGISTER_KERNEL(coalesce, GPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 8cd55e50d98d0..859857ed7baac 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -545,8 +545,8 @@ int ProductRuleBook(const Context& dev_ctx, DenseTensor* out_index, DenseTensor* unique_value, SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { + int* h_counter, + int* h_offsets) { auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); @@ 
-633,16 +633,11 @@ int ProductRuleBook(const Context& dev_ctx, out->SetMember(out_indices, out_values, out_dims, false); - CallThrustScan(dev_ctx, - counter_ptr, - kernel_size, - offsets_ptr, - h_counter->data(), - h_offsets->data()); + CallThrustScan( + dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets); dev_ctx.Wait(); - int rulebook_len = - (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; + int rulebook_len = h_offsets[kernel_size - 1] + h_counter[kernel_size - 1]; DenseTensor out_rulebook = phi::Empty(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); @@ -691,12 +686,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT rulebook_len = (last - rulebook_ptr) / 2; - CallThrustScan(dev_ctx, - counter_ptr, - kernel_size, - offsets_ptr, - h_counter->data(), - h_offsets->data()); + CallThrustScan( + dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 89159f4c55a70..543f3884edcb4 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -65,7 +65,11 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf @@ -86,8 +90,8 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, key, out_dims, out, - &h_counter, - &offsets, + h_counter.data(), + h_offsets.data(), &rulebook_len, &need_product_rulebook); } @@ -108,8 +112,8 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &h_counter, - &offsets); + h_counter_ptr, + h_offsets_ptr); rulebook_ptr = tmp_rulebook.data(); phi::funcs::sparse::SaveToTable( @@ -161,17 +165,17 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = h_counter[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; const int N = out_channels; - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 7ac727cae4ca9..a34a87eb1f645 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -90,8 +90,8 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &h_counter, - &offsets); + h_counter.data(), + offsets.data()); const IntT* rulebook_ptr = 
rulebook->data(); diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 2f5bb189c0ffe..12d55596a935d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,10 +153,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - SparseCooTensor before_coalesced( - indices, values, phi::make_ddim(dense_shape.GetData())); - // CoalescedKernel(dev_ctx, before_coalesced, out); - *out = before_coalesced; + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); } } // namespace sparse diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index df0e87c6b5a49..f7c7b7e9486ee 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/kernels/sparse/conv_kernel.h" @@ -212,7 +212,7 @@ void TestConv3dBase(const std::vector& indices, "Conv3d", &d_rulebook, &d_counter); - SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 7497dca51a59c..a06d85738586e 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/pool_kernel.h" @@ -160,7 +160,7 @@ void TestMaxPoolBase(const std::vector& indices, strides, &d_rulebook, &d_counter); - SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index ede33e4167472..36ecfeccd1a1d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -53,7 +53,7 @@ def test_conv3d(self): groups=1, data_format="NDHWC") out.backward(out) - out = paddle.incubate.sparse.coalesced(out) + out = paddle.incubate.sparse.coalesce(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index 6c9678873abe5..7e8fdfa7bfd35 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -18,6 +18,7 @@ from .unary import sqrt from .unary import sin from .unary import tanh +from .unary import coalesce from .binary import mv from .binary import matmul @@ -28,8 +29,6 @@ from .math import multiply from .math import subtract -from .coalesced import coalesced - from . import nn __all__ = [ @@ -45,5 +44,5 @@ 'subtract', 'multiply', 'divide', - 'coalesced', + 'coalesce', ] diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py deleted file mode 100644 index 23c82499851d3..0000000000000 --- a/python/paddle/incubate/sparse/coalesced.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle import _C_ops -from paddle.fluid.framework import core, dygraph_only - -__all__ = [ - 'coalesced', -] - - -@dygraph_only -def coalesced(x): - r""" - the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . - - Args: - x (Tensor): the input SparseCooTensor. - - Returns: - Tensor: return the SparseCooTensor after coalesced. - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.incubate import sparse - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 1], [1, 1, 2]] - values = [1.0, 2.0, 3.0] - sp_x = sparse.sparse_coo_tensor(indices, values) - sp_x = sparse.coalesced(sp_x) - print(sp_x.indices()) - #[[0, 1], [1, 2]] - print(sp_x.values()) - #[3.0, 3.0] - """ - return _C_ops.final_state_sparse_coalesced(x) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index 09e449b0d9c5e..9220debdc24de 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -109,3 +109,35 @@ def sin(x, name=None): out = paddle.incubate.sparse.sin(sparse_x) """ return _C_ops.final_state_sparse_sin(x) + + +@dygraph_only +def coalesce(x): + r""" + the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . + + Args: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: return the SparseCooTensor after coalesced. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate import sparse + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesced(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ + return _C_ops.final_state_sparse_coalesce(x) From 123b16c977acdebc1482dc703fc7df1002cabd6a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 8 Jul 2022 13:05:59 +0000 Subject: [PATCH 66/70] fix test_sparse_utils --- python/paddle/fluid/tests/unittests/test_sparse_utils_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index b71ef0357cb37..471ab432215ef 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -298,7 +298,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) - sparse_x = paddle.incubate.sparse.coalesced(sparse_x) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -311,7 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) - sparse_x = paddle.incubate.sparse.coalesced(sparse_x) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) From af66998715bd5d0a85d796f676125bfa74567f21 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 13 Jul 2022 06:28:15 +0000 Subject: [PATCH 67/70] sparse support amp --- paddle/fluid/eager/eager_amp_auto_cast.h | 7 ++++++- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 9 +++++++-- paddle/phi/kernels/sparse/empty_kernel.cc | 2 ++ paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu | 4 ++++ paddle/phi/kernels/sparse/gpu/unary_kernel.cu | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 438ccbaca8a5e..e13d0cbc7484d 100644 --- 
a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -85,7 +85,12 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { - return cast_final_state_dygraph_function(input, dst_dtype); + if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { + return sparse::cast_final_state_dygraph_function( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } else { + return cast_final_state_dygraph_function(input, dst_dtype); + } } return input; } diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 8f4af0a450890..e9f820a318482 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -503,5 +503,10 @@ void Pad3dGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pad3d_grad, + GPU, + ALL_LAYOUT, + phi::Pad3dGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index fe7fb72b4caa6..c1706b9919d90 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -97,6 +97,7 @@ PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -112,6 +113,7 @@ PD_REGISTER_KERNEL(empty_like_csr, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCsrKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu index c1f2b2a1f0d1d..be0f13fb0e538 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu @@ -23,6 +23,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -32,6 +33,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -56,6 +58,7 @@ PD_REGISTER_KERNEL(cast_coo_grad, GPU, ALL_LAYOUT, phi::sparse::CastCooGradKernel, + phi::dtype::float16, float, double, int8_t, @@ -69,6 +72,7 @@ PD_REGISTER_KERNEL(cast_csr_grad, GPU, ALL_LAYOUT, phi::sparse::CastCsrGradKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu index fdf0b5106d3cf..6358b7b983576 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu @@ -67,6 +67,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -76,6 +77,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -119,6 +121,7 @@ PD_REGISTER_KERNEL(cast_coo, GPU, ALL_LAYOUT, phi::sparse::CastCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -132,6 +135,7 @@ PD_REGISTER_KERNEL(cast_csr, GPU, ALL_LAYOUT, phi::sparse::CastCsrKernel, + phi::dtype::float16, float, 
double, int8_t, From 91ee01bea4416955ae964b2926f18fe9b712b991 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 18 Jul 2022 09:07:39 +0000 Subject: [PATCH 68/70] resolve conflict --- paddle/phi/kernels/sparse/conv_grad_kernel.h | 11 +- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 197 ++++++++++-------- .../kernels/test_sparse_conv3d_dev_api.cc | 2 - 3 files changed, 120 insertions(+), 90 deletions(-) diff --git a/paddle/phi/kernels/sparse/conv_grad_kernel.h b/paddle/phi/kernels/sparse/conv_grad_kernel.h index 205823e620375..867f6b5a53f37 100644 --- a/paddle/phi/kernels/sparse/conv_grad_kernel.h +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -25,13 +25,16 @@ template void Conv3dCooGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, + const SparseCooTensor& out, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* x_grad, DenseTensor* kernel_grad); @@ -40,13 +43,16 @@ std::tuple Conv3dCooGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, + const SparseCooTensor& out, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, - const bool subm) { + const bool subm, + const std::string& key) { SparseCooTensor x_grad; DenseTensor kernel_grad; @@ -54,13 +60,16 @@ std::tuple Conv3dCooGrad( Conv3dCooGradKernel(dev_ctx, x, kernel, + out, rulebook, + counter, out_grad, paddings, dilations, strides, groups, subm, + key, &x_grad, &kernel_grad); return std::make_tuple(x_grad, kernel_grad); diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 6820b677147f3..543f3884edcb4 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -21,7 +21,9 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" + +#include "glog/logging.h" namespace phi { namespace sparse { @@ -35,8 +37,10 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -61,85 +65,117 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. 
product rulebook - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensorMeta offsets_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); - DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); - DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); - DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - - int n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); - - const int* counter_ptr = counter_per_kernel.data(); - const int* offsets_ptr = counter_per_kernel.data(); - const IntT* rulebook_ptr = rulebook->data(); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor out_index = phi::Empty(dev_ctx, {1}); + DenseTensor unique_value = phi::Empty(dev_ctx, {1}); + + VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + bool need_product_rulebook = true; + if (subm && !key.empty()) { + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + h_counter.data(), + h_offsets.data(), + &rulebook_len, + &need_product_rulebook); + } + + if (need_product_rulebook) { + DenseTensor tmp_rulebook; + rulebook_len = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + &tmp_rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + h_counter_ptr, + h_offsets_ptr); + rulebook_ptr = tmp_rulebook.data(); + + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); + } // 2. gather - DenseTensorMeta in_features_meta( - x.dtype(), {n, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_features_meta( - x.dtype(), {n, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_features = - phi::Empty(dev_ctx, std::move(out_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + Gather(dev_ctx, + x.non_zero_elements().data(), + rulebook_ptr, + rulebook_len, + in_channels, + in_features_ptr); // 3. 
call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); auto* out_values = out->mutable_non_zero_elements(); T* out_values_ptr = out_values->data(); + set_zero(dev_ctx, out_values, static_cast(0.0f)); + + if (subm) { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + unique_value.ResizeAndAllocate( + {static_cast(out->nnz() * kernel_size)}); + out_index.ResizeAndAllocate({static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + GroupIndexs<<>>(rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = h_counter[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; const int N = out_channels; - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, @@ -154,40 +190,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { - set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); - phi::funcs::ScatterCUDAKernel - <<>>(out_features_ptr, - rulebook_ptr + 2 * n, - out_values_ptr, - n, - out_channels, - false); - } else { - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + 1, + out_values_ptr); } + /** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) + * x: the input SparseCooTensor, shape is (N, D, H, W, C) + * kernel: the weight data, shape is (D, H, W, C, OC) + * out: the output SparseCooTensor, shape is (N, D, H, W, OC) + * rulebook: return rulebook if key is not vailed else return nullptr + * counter: return counter if key is not vailed else return nullptr **/ template void Conv3dCooKernel(const Context& dev_ctx, @@ -198,8 +217,10 @@ void Conv3dCooKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { Conv3dCooGPUKernel(dev_ctx, @@ -210,8 +231,10 @@ void Conv3dCooKernel(const Context& dev_ctx, strides, groups, subm, + key, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 27b0ef2667973..f7c7b7e9486ee 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -214,8 +214,6 @@ void 
TestConv3dBase(const std::vector& indices,
                                            &d_counter);
     SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out);
-    SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out);
-
     ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
     ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
     for (int i = 0; i < correct_out_dims.size(); i++) {

From a51402a82941977e11a009b488431e200dc02583 Mon Sep 17 00:00:00 2001
From: zkh2016
Date: Tue, 19 Jul 2022 08:29:31 +0000
Subject: [PATCH 69/70] fix codestyle

---
 paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
index fdaff98de4329..a8e88f351ccbc 100644
--- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
@@ -155,7 +155,6 @@ void CoalesceGPUKernel(const GPUContext& dev_ctx,
                                      out_values.data());
   }
-
   // 6. convert index to coordinate
   Dim const_dims;
   for (int i = 0; i < x.dims().size(); i++) {

From ab996e00fa09007055409d2943339aa188dc58c1 Mon Sep 17 00:00:00 2001
From: zkh2016
Date: Mon, 25 Jul 2022 11:22:10 +0000
Subject: [PATCH 70/70] supplement the description of key

---
 python/paddle/incubate/sparse/nn/functional/conv.py | 4 ++++
 python/paddle/incubate/sparse/nn/layer/conv.py      | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py
index 503ad9a127b0f..60cbb94bea236 100644
--- a/python/paddle/incubate/sparse/nn/functional/conv.py
+++ b/python/paddle/incubate/sparse/nn/functional/conv.py
@@ -277,6 +277,10 @@ def subm_conv3d(x,
            will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
            The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of:
            `[batch_size, input_depth, input_height, input_width, input_channels]`.
+        key(str, optional): the key is used to save or reuse the same rulebook.
+            The definition and role of the rulebook are described in
+            https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf.
+            The default value is None.
         name(str|None): For detailed information, please refer to
            :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.
diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py
index c7fe1f7b4033e..f44358bbe9f3e 100644
--- a/python/paddle/incubate/sparse/nn/layer/conv.py
+++ b/python/paddle/incubate/sparse/nn/layer/conv.py
@@ -297,6 +297,10 @@ class SubmConv3D(_Conv3D):
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. The default value is 1.
         padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``.
+        key(str, optional): the key is used to save or reuse the same rulebook.
+            The definition and role of the rulebook are described in
+            https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf.
+            The default value is None.
         weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
            will create ParamAttr as param_attr. If it is set to None, the parameter
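
As an editorial illustration of the `key` argument documented above, a minimal sketch of two submanifold convolutions sharing one rulebook might look like the following (the input construction mirrors the existing sparse conv unit test, and the SubmConv3D argument order is assumed from its docstring rather than taken from this patch):

.. code-block:: python

    import paddle
    from paddle.incubate import sparse
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        # A [1, 1, 3, 4, 1] NDHWC sparse input with 4 non-zero sites.
        indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]]
        values = [[1.0], [2.0], [3.0], [4.0]]
        x = sparse.sparse_coo_tensor(indices, values, [1, 1, 3, 4, 1])

        # Submanifold convs keep the sparsity pattern unchanged, so layers
        # that pass the same key can reuse one cached rulebook/counter pair
        # instead of recomputing it for every layer.
        conv1 = sparse.nn.SubmConv3D(1, 8, 3, padding=1, key='subm_block1')
        conv2 = sparse.nn.SubmConv3D(8, 8, 3, padding=1, key='subm_block1')
        out = conv2(conv1(x))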