From bb4db9b1d883d374a330699a85fef61b23539ff5 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 9 May 2022 07:52:18 +0000 Subject: [PATCH 01/70] test sparse model --- .../unittests/test_sparse_middle_extractor.py | 324 ++++++++++++++++++ .../tests/unittests/test_sparse_mnist.py | 126 +++++++ 2 files changed, 450 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py new file mode 100644 index 0000000000000..ae52b4a413336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py @@ -0,0 +1,324 @@ +import paddle +import paddle.nn as nn +import paddle.sparse as sparse +from paddle.fluid.framework import _test_eager_guard +import time +import numpy as np +import torch +import spconv.pytorch as spconv +import inspect + +class MiddleExtractor(paddle.nn.Layer): + def __init__(self, + #output_shape, + use_norm=True, + num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='MiddleExtractor'): + super(MiddleExtractor, self).__init__() + self.name = name + if not use_norm: + self.middle_conv = paddle.nn.Sequential( + #nn.Pad3D(1), + nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D([1, 1, 1, 1, 0, 0]), + nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D(1), + nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + ) + else: + self.middle_conv = paddle.nn.Sequential( + #nn.Pad3D(1), + nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D([1, 1, 1, 1, 0, 0]), + nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + #nn.Pad3D(1), + nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), + nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), + nn.ReLU(), + ) + def forward(self, x): + return self.middle_conv(x) + + +def get_pos_to_kw_map(func): + pos_to_kw = {} + fsig = inspect.signature(func) + pos = 0 + for name, info in fsig.parameters.items(): + if info.kind is info.POSITIONAL_OR_KEYWORD: + pos_to_kw[pos] = name + pos += 1 + return pos_to_kw + +def change_default_args(**kwargs): + def layer_wrapper(layer_class): + class DefaultArgLayer(layer_class): + def __init__(self, *args, **kw): + pos_to_kw = get_pos_to_kw_map(layer_class.__init__) + kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} + for key, val in kwargs.items(): + if key not in kw and kw_to_pos[key] > len(args): + kw[key] = val + super().__init__(*args, **kw) + + return DefaultArgLayer + + return layer_wrapper + +class Empty(torch.nn.Module): + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + +class SpconvMiddleExtractor(torch.nn.Module): + def __init__(self, + #output_shape, + use_norm=True, + 
num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='SpconvMiddleExtractor'): + super(SpconvMiddleExtractor, self).__init__() + if use_norm: + BatchNorm1d = change_default_args( + eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) + Linear = change_default_args(bias=False)(nn.Linear) + else: + BatchNorm1d = Empty + Linear = change_default_args(bias=True)(nn.Linear) + + middle_layers = [] + + num_filters = [num_input_features] + num_filters_down1 + filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + + for i, o in filters_pairs_d1: + middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) + if use_norm: + #middle_layers.append(BatchNorm1d(o)) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + + middle_layers.append( + spconv.SparseConv3d( + num_filters[-1], + num_filters[-1], (3, 1, 1), (2, 1, 1), + bias=False)) + + if use_norm: + #middle_layers.append( + # BatchNorm1d(num_filters[-1])) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + + + # assert len(num_filters_down2) > 0 + if len(num_filters_down1) == 0: + num_filters = [num_filters[-1]] + num_filters_down2 + else: + num_filters = [num_filters_down1[-1]] + num_filters_down2 + filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + for i, o in filters_pairs_d2: + middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) + if use_norm: + #middle_layers.append(BatchNorm1d(o)) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + middle_layers.append( + spconv.SparseConv3d( + num_filters[-1], + num_filters[-1], (3, 1, 1), (2, 1, 1), + bias=False)) + if use_norm: + #middle_layers.append( + #BatchNorm1d(num_filters[-1])) + middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) + middle_layers.append(torch.nn.ReLU()) + #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) + middle_layers.append(spconv.ToDense()) + self.middle_conv = spconv.SparseSequential(*middle_layers) + + def forward(self, x): + out = self.middle_conv(x) + return out + +class SparseMiddleExtractor(paddle.nn.Layer): + def __init__(self, + #output_shape, + use_norm=True, + num_input_features=128, + num_filters_down1=[64], + num_filters_down2=[64, 64], + name='SparseMiddleExtractor'): + super(SparseMiddleExtractor, self).__init__() + self.name = name + + middle_layers = [] + num_filters = [num_input_features] + num_filters_down1 + filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] + for i, o in filters_pairs_d1: + middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) + + if use_norm: + middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + + if len(num_filters_down1) == 0: + num_filters = [num_filters[-1]] + num_filters_down2 + else: + num_filters = [num_filters_down1[-1]] + num_filters_down2 + + filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] + + for i, o in filters_pairs_d2: + middle_layers.append(sparse.SubmConv3D(i, o, 3, 
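# Descriptive note (editorial comment, not part of the patch): SubmConv3D mirrors spconv.SubMConv3d
# in SpconvMiddleExtractor above -- a submanifold sparse convolution that keeps the input's sparsity
# pattern -- while the strided sparse.Conv3D layers in this module downsample it.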
bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) + if use_norm: + middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) + middle_layers.append(sparse.ReLU()) + + self.middle_conv = nn.Sequential(*middle_layers) + + def forward(self, x): + sparse_out = self.middle_conv(x) + #return sparse_out + return sparse_out.to_dense() + + +def test(): + paddle.seed(0) + with _test_eager_guard(): + in_channels = 128 + # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. + shape = [20, 40, 100] + batch_size = 1 + sparsity = 0.95 + + full_shape = [batch_size] + shape + [in_channels] + print(full_shape) + + total_elements = np.prod(shape) + nnz = int(total_elements * (1-sparsity)) + print("nnz=", nnz) + + #product indices + indices = [] + for i in range(4): + indices.append(paddle.randint(0, full_shape[i], [1, nnz])) + + indices = paddle.concat(indices) + #product values + values = paddle.randn((nnz, in_channels)) + + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) + + dense_x = sparse_x.to_dense() + + #spconv + device = torch.device("cuda") + torch_x = torch.tensor(dense_x.numpy(), device=device) + + spconv_x = spconv.SparseConvTensor.from_dense(torch_x) + + #whether to use batch_norm + use_norm = True + + dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) + spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) + sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) + layer_nums = len(sparse_model.middle_conv) + block_size = 3 if use_norm else 2 + layer_nums = int(layer_nums / block_size) + + for i in range(0, layer_nums): + weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) + sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) + if use_norm: + bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) + sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) + + print(dense_model) + print(sparse_model) + print(spconv_model) + paddle.device.cuda.synchronize() + + #warm up + dense_x.stop_gradient=True + out1 = dense_model(dense_x) + paddle.device.cuda.synchronize() + sparse_x.stop_gradient=True + out2 = sparse_model(sparse_x) + paddle.device.cuda.synchronize() + spconv_x.features.required_grad=False + out3 = spconv_model(spconv_x) + torch.cuda.synchronize(device) + #warm up + + t0 = time.time() + #padde dense + dense_x.stop_gradient=False + out1 = dense_model(dense_x) + out1.backward(out1) + paddle.device.cuda.synchronize() + t1 = time.time() + + #padde sparse + sparse_x.stop_gradient=False + out2 = sparse_model(sparse_x) + out2.backward(out2) + paddle.device.cuda.synchronize() + t2 = time.time() + + #spconv + spconv_x.features.required_grad=True + spconv_x.features.requires_grad_() + out3 = spconv_model(spconv_x) + out3.backward(out3) + torch.cuda.synchronize(device) + t3 = time.time() + + # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 + # Note 3. 
Running only the forward pass, the sparse time is close to spconv's, and the higher the sparsity the better sparse performs; with the current forward+backward test, spconv's time is very high and the reason is unknown + print("dense time: ", t1 - t0) + print("sparse time: ", t2 - t1) + print("spconv time: ", t3 - t2) + + # Note 4. paddle's and torch's BN results differ slightly: for a random input of shape=(4000, 64), a single BN layer shows a forward error around 1e-6 and a backward error around 1e-4 + #verify the forward calculation result + assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) + + #verify the backward calculation result + assert np.allclose(spconv_x.features.grad.cpu().numpy(), + sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) + +test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py new file mode 100644 index 0000000000000..3589dc83090f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py @@ -0,0 +1,126 @@ +import paddle +from paddle.vision.transforms import Compose, Normalize, ToTensor +from paddle.fluid.framework import _test_eager_guard +import time + +paddle.disable_static() +#transform = Compose([Normalize(mean=[127.5], +# std=[127.5], +# data_format='CHW')]) +transform = Compose([ToTensor()]) +# use transform to normalize the dataset +print('download training data and load training data') +train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) +test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) +print('load finished') + +import numpy as np +#import matplotlib.pyplot as plt +train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] +train_data0 = train_data0.reshape([28,28]) +#plt.figure(figsize=(2,2)) +#plt.imshow(train_data0, cmap=plt.cm.binary) +print('train_data0 label is: ' + str(train_label_0)) + + +import paddle +import paddle.nn.functional as F +class SparseLeNet(paddle.nn.Layer): + def __init__(self): + super(SparseLeNet, self).__init__() + #self.bn = paddle.sparse.BatchNorm(1) + self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) + self.relu1 = paddle.sparse.ReLU() + self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) + self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) + self.relu2 = paddle.sparse.ReLU() + self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) + + self.fc1 = paddle.nn.Linear(16*5*5, 120) + self.fc2 = paddle.nn.Linear(120, 84) + self.fc3 = paddle.nn.Linear(84, 10) + + def forward(self, x): + #x = self.bn(x) + x = self.conv1(x) + x = self.relu1(x) + x = self.pool1(x) + x = self.conv2(x) + x = self.relu2(x) + x = self.pool2(x) + x = x.to_dense() + + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self.fc1(x) + x = paddle.nn.functional.relu(x) + x = self.fc2(x) + x = paddle.nn.functional.relu(x) + x = self.fc3(x) + return x + +import paddle.nn.functional as F +train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) +# load the training set with batch_size set to 64 +# sparse training + +def prepare_data(x_data): + x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) + x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) + return x_data + +def sparse_train(model): + model.train() + epochs = 2 + optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) + # use Adam as the optimizer + for epoch in range(epochs): + for batch_id, data in enumerate(train_loader()): + x_data = data[0] + y_data = data[1] + x_data = prepare_data(x_data) +
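# Descriptive note (editorial comment, not part of the patch): prepare_data above reshapes the
# dense NCHW MNIST batch to NDHWC with a depth of 1; to_sparse_coo(4) below keeps the first four
# dims (N, D, H, W) as sparse coordinates and leaves the trailing channel dim dense, matching the
# layout consumed by the sparse Conv3D/MaxPool3D layers in SparseLeNet.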
x_data = x_data.to_sparse_coo(4) + x_data.stop_gradient=False + predicts = model(x_data) + loss = F.cross_entropy(predicts, y_data) + # 计算损失 + acc = paddle.metric.accuracy(predicts, y_data) + loss.backward() + if batch_id % 300 == 0: + print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) + optim.step() + optim.clear_grad() + +test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) +# 加载测试数据集 +def test(model): + model.eval() + batch_size = 64 + for batch_id, data in enumerate(test_loader()): + x_data = data[0] + y_data = data[1] + x_data = prepare_data(x_data) + x_data = x_data.to_sparse_coo(4) + predicts = model(x_data) + # 获取预测结果 + loss = F.cross_entropy(predicts, y_data) + acc = paddle.metric.accuracy(predicts, y_data) + if batch_id % 20 == 0: + print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) + +with _test_eager_guard(): + sparse_model = SparseLeNet() + print(sparse_model) + + t0 = time.time() + sparse_train(sparse_model) + t1 = time.time() + print("spare time:", t1-t0) + test(sparse_model) + #x = paddle.randn((1, 1,28,28,1)) + #x.stop_gradient=False + #sparse_x = x.to_sparse_coo(4) + #print("sparse_x values shape:", sparse_x.values().shape) + #out = sparse_model(sparse_x) + #out.backward(out) + #print("end") + From 441da36cb4080172c37acb547a73cc580344cbd3 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 29 May 2022 18:00:48 +0800 Subject: [PATCH 02/70] refactor code structure --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 182 ++++++++++---------- 1 file changed, 89 insertions(+), 93 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035..e572ec70dbebe 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -451,10 +451,80 @@ void BatchNormKernel(const Context &ctx, paddle::framework::TensorCopy(x, ctx.GetPlace(), y); } else { double this_factor = 1. 
- momentum; - - bool called = false; +#ifdef PADDLE_WITH_HIP + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); +#else #if CUDNN_VERSION_MIN(7, 4, 1) - called = true; size_t workspace_size = 0; size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; @@ -530,102 +600,28 @@ void BatchNormKernel(const Context &ctx, workspace_size, reserve_space_ptr, reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, - block, - DataLayout::kNCHW><<>>( +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, scale.template data>(), bias.template data>(), - C, - N, - H * W * D, - epsilon, this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, - block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), epsilon, - this_factor, - 
transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, - mode_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, - transformed_x.template data(), - data_desc_, - ctx.template Alloc(&transformed_y), - bn_param_desc_, - scale.template data>(), - bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), - epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance))); + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance))); +#endif // CUDNN_VERSION_MIN(7, 4, 1) #endif - } } } From c48e076349ba257bcd87f874f792aaf449852fd8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 29 May 2022 22:55:48 +0800 Subject: [PATCH 03/70] add native kernel usage --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 222 ++++++++++++-------- 1 file changed, 134 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index e572ec70dbebe..08eea1f8717cd 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -524,103 +524,149 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else -#if CUDNN_VERSION_MIN(7, 4, 1) - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - DenseTensor workspace_tensor; - DenseTensor reserve_space_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. 
- // auto *reserve_space = ctx.Output("ReserveSpace"); - if (reserve_space == nullptr) { - reserve_space = &reserve_space_tensor; - } - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - phi::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space->Resize({static_cast(reserve_space_size)}); - reserve_space_ptr = - static_cast(ctx.template Alloc(reserve_space)); - workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = - static_cast(ctx.template Alloc(&workspace_tensor)); - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, - mode_, - CUDNN_BATCHNORM_OPS_BN, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, + const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + if(use_native_kernel) { + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + block, + DataLayout::kNCHW><<>>( transformed_x.template data(), - nullptr, - nullptr, - data_desc_, - transformed_y.template data(), - bn_param_desc_, scale.template data>(), bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), + C, + N, + H * W * D, epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance), - nullptr, - workspace_ptr, - workspace_size, - reserve_space_ptr, - reserve_space_size)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, - mode_, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( transformed_x.template data(), - data_desc_, - ctx.template Alloc(&transformed_y), - bn_param_desc_, scale.template data>(), bias.template data>(), - this_factor, - ctx.template Alloc>(mean_out), - ctx.template Alloc>(variance_out), + C, + N, + H * W * D, epsilon, - ctx.template Alloc>(saved_mean), - ctx.template Alloc>(saved_variance))); + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + } else { +#if CUDNN_VERSION_MIN(7, 4, 1) + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; 
+ DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + // auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*activationDesc=*/nullptr, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space->Resize({static_cast(reserve_space_size)}); + reserve_space_ptr = + static_cast(ctx.template Alloc(reserve_space)); + workspace_tensor.Resize({static_cast(workspace_size)}); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + CUDNN_BATCHNORM_OPS_BN, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + nullptr, + nullptr, + data_desc_, + transformed_y.template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), + epsilon, + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance), + nullptr, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + ctx.template Alloc>(mean_out), + ctx.template Alloc>(variance_out), + epsilon, + ctx.template Alloc>(saved_mean), + ctx.template Alloc>(saved_variance))); #endif // CUDNN_VERSION_MIN(7, 4, 1) + } #endif } } From 0a68ba3641939219cc22c45a5ca772e419107ecf Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 00:21:54 +0800 Subject: [PATCH 04/70] add wellford impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 268 +++++++++++++++++++- 1 file changed, 264 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 08eea1f8717cd..15a84c4ae918e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -140,6 +140,265 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( + const T *x, + const BatchNormParamType 
*scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + constexpr int THREADS_PER_WARP = 32; + constexpr int THREADS_BITS_PER_WARP = 5; + + constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; + const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); + + __shared__ int warp_shared_count[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType local_mean = static_cast>(0); + BatchNormParamType local_var_n = static_cast>(0); + int local_count = 0; + + // thread-local iterative computation + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + BatchNormParamType delta = (x_i - local_mean); + local_count++; + local_mean += delta / local_count; + local_var_n += delta * (x_i - local_mean); + } + + // warp sum + for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x % THREADS_PER_WARP == 0) { + warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; + warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; + warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; + } + __syncthreads(); + + // block sum + if (threadIdx.x < WARP_PER_BLOCK) { + local_count = warp_shared_count[threadIdx.x]; + local_mean = warp_shared_count[threadIdx.x]; + local_var_n = warp_shared_count[threadIdx.x]; + } + + for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x == 0) { + mean_val = local_mean; + variance_val = local_var_n / local_count; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) 
* variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + constexpr int PARALLEL_LOADS = 4; + + constexpr int THREADS_PER_WARP = 32; + constexpr int THREADS_BITS_PER_WARP = 5; + + constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; + const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); + + __shared__ int warp_shared_count[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; + __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType tmp_local_mean[PARALLEL_LOADS]; + BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; + int tmp_local_count[PARALLEL_LOADS]; + + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + tmp_local_mean[k] = static_cast>(0); + tmp_local_var_n[k] = static_cast>(0); + tmp_local_count[k] = 0; + } + + // thread-local iterative computation + for (int j = threadIdx.x; j < inner_size; j += PARALLEL_LOADS * blockDim.x) { + BatchNormParamType tmp_local_x[PARALLEL_LOADS]; + BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; + BatchNormParamType valid[PARALLEL_LOADS]; + auto offset = j; + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + if(offset < inner_size) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(offset / HxW * C + i) * HxW + offset % HxW + : offset * outer_size + i; + tmp_local_x[k] = static_cast>(x[index]); + tmp_local_count[k]++; + tmp_local_count_inv[k] = static_cast>(1) / tmp_local_count[k]; + valid[k] = static_cast>(1); + } else { + tmp_local_x[k] = static_cast>(0); + tmp_local_count_inv[k] = static_cast>(0); + valid[k] = static_cast>(0); + } + offset += blockDim.x; + } + + #pragma unroll + for(int k = 0; k < PARALLEL_LOADS; k++) { + BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); + tmp_local_mean[k] += delta * tmp_local_count_inv[k]; + tmp_local_var_n[k] += delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; + } + } + + #pragma unroll + for(int k = 1; k < PARALLEL_LOADS; k++) { + BatchNormParamType factor = 1.0 / static_cast(max(1, tmp_local_count[0]+tmp_local_count[k])); + BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); + tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + tmp_local_count[k] * tmp_local_mean[k]) * factor; + tmp_local_var_n[0] += (tmp_local_var_n[k] + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); + tmp_local_count[0] += tmp_local_count[k]; + } + + BatchNormParamType local_mean = tmp_local_mean[0]; + BatchNormParamType local_var_n = tmp_local_var_n[0]; + int local_count = tmp_local_count[0]; + + // warp sum + for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x % THREADS_PER_WARP == 0) { + warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; + warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; + warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; + } + __syncthreads(); + + // block sum + if (threadIdx.x < WARP_PER_BLOCK) { + local_count = warp_shared_count[threadIdx.x]; + local_mean = warp_shared_count[threadIdx.x]; + local_var_n = warp_shared_count[threadIdx.x]; + } + + for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); + local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + local_mean = (local_count * local_mean + o_count * o_mean) * factor; + local_count += o_count; + } + + if (threadIdx.x == 0) { + mean_val = local_mean; + variance_val = local_var_n / local_count; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = 
layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -524,15 +783,16 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + const bool use_native_kernel = true; if(use_native_kernel) { const int num = transformed_x.numel(); - const int block = 256; + const int block = 1024; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< + BNForwardTrainingWellfordParallel< T, block, DataLayout::kNCHW><<>>( @@ -550,7 +810,7 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTraining< + BNForwardTrainingWellfordParallel< T, block, DataLayout::kNHWC><<>>( From b3248c9ef649ac2073853ae0b39f8b4fa3175d1b Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 15:32:25 +0800 Subject: [PATCH 05/70] add shmem impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 83 +++++++++++++++++++-- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 15a84c4ae918e..394b6399977dd 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -399,6 +399,77 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } } + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + BatchNormParamType* x_buf = reinterpret_cast*>(smem_buf); + + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_buf[j] = x_i; + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x_buf[j]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -786,16 +857,16 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { - const int num = transformed_x.numel(); - const int block = 1024; + const int block = 512; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); + const size_t smem_size = N * H * W * D * sizeof(BatchNormParamType); if (compute_format == DataLayout::kNCHW) { - BNForwardTrainingWellfordParallel< + BNForwardTrainingSMem< T, block, - DataLayout::kNCHW><<>>( + DataLayout::kNCHW><<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -810,10 +881,10 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTrainingWellfordParallel< + BNForwardTrainingSMem< T, block, - DataLayout::kNHWC><<>>( + DataLayout::kNHWC><<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), From 78349a2d57bc34fc0897bacd9f42cc2fa9a918ee Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 4 Jun 2022 17:07:29 +0800 Subject: [PATCH 06/70] add dispatch logic --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 92 ++++++++++++++++++--- 1 file changed, 79 insertions(+), 13 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 394b6399977dd..ab797aa186f65 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -470,6 +470,81 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( } } +template +inline bool TryDispatchBNForwardTrainingSMem( + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + constexpr int block_size = 512; + const size_t smem = N * HxW * sizeof(BatchNormParamType); + int max_active_blocks_conf; + { + 
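// Descriptive note (editorial comment, not part of the patch): the occupancy query below checks
// whether a 512-thread block that caches the whole N*HxW column in dynamic shared memory can still
// be resident on an SM; a result of zero makes this helper return false, and the caller then falls
// back to the plain BNForwardTraining kernel.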
cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf, + BNForwardTrainingSMem, + block_size, smem); + } + if (max_active_blocks_conf <= 0) { + return false; + } + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block_size, 1); + const int grid = std::min(C, max_blocks); + BNForwardTrainingSMem<<>>( + x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + return true; +} + +template +inline void DispatchBNForwardTraining( + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + if ((N * HxW) <= 1024) { + // TODO: impl register-cache version + return; + } else { + bool dispatch_smem_impl_success = false; + { + dispatch_smem_impl_success = TryDispatchBNForwardTrainingSMem( + ctx, x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + } + if (!dispatch_smem_impl_success) { + const int block = 512; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + return BNForwardTraining<<>>( + x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, + y, mean, variance, save_mean, save_inv_variance); + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -857,16 +932,9 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - const size_t smem_size = N * H * W * D * sizeof(BatchNormParamType); if (compute_format == DataLayout::kNCHW) { - BNForwardTrainingSMem< - T, - block, - DataLayout::kNCHW><<>>( + DispatchBNForwardTraining( + ctx, transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -881,10 +949,8 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTrainingSMem< - T, - block, - DataLayout::kNHWC><<>>( + DispatchBNForwardTraining( + ctx, transformed_x.template data(), scale.template data>(), bias.template data>(), From 98c66f0df7b2ab0c33c89a0bd2a18fce2b13440a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 5 Jun 2022 00:21:12 +0800 Subject: [PATCH 07/70] add channel_last impl --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 280 +++++++++++++++++--- 1 file changed, 250 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index ab797aa186f65..14778a89a4657 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -25,6 +25,7 @@ namespace cub = hipcub; #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/fluid/operators/norm_utils.cu.h" #include 
"paddle/fluid/operators/norm_utils.h" @@ -399,6 +400,158 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } } +template +static __global__ void BNForwardTraining2D( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + + BatchNormParamType* mean_val = reinterpret_cast*>(smem_buf); + BatchNormParamType* variance_val = reinterpret_cast*>(&smem_buf[blockDim.x]); + BatchNormParamType* inv_var_val = reinterpret_cast*>(&smem_buf[2*blockDim.x]); + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; + #pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + if (gridDim.y > 1) { + volatile BatchNormParamType* staging_sum = block_data_ptr; + volatile BatchNormParamType* staging_square_sum = &block_data_ptr[C*gridDim.y]; + // write block data to global memory + if (threadIdx.y == 0) { + staging_sum[i + blockIdx.y * C] = x_sum; + staging_square_sum[i + blockIdx.y * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + x_sum += staging_sum[i+y*C]; + x_square_sum += staging_square_sum[i+y*C]; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; + #pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + // final compute + if(threadIdx.y == 0) { + 
mean_val[threadIdx.x] = x_sum / inner_size; + variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; + inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val[threadIdx.x]; + save_inv_variance[i] = inv_var_val[threadIdx.x]; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + + exponentialAverageFactor * variance[i]; + } + } + } else { + if(blockIdx.y == 0 && threadIdx.y == 0) { + mean_val[threadIdx.x] = x_sum / inner_size; + variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; + inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val[threadIdx.x]; + save_inv_variance[i] = inv_var_val[threadIdx.x]; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + + exponentialAverageFactor * variance[i]; + } + } + __syncthreads(); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val[threadIdx.x]; + y[index] = scale[i] * x_sub_mean * inv_var_val[threadIdx.x] + bias[i]; + } + } +} template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( @@ -932,38 +1085,105 @@ void BatchNormKernel(const Context &ctx, //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; if(use_native_kernel) { + dim3 block; + dim3 grid; + + const int block_size = 512; + // init block&grid config + int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + BatchNormParamType* block_data_ptr = nullptr; + int* flag_ptr = nullptr; + if(grid.y > 1) { + block_data_tensor.Resize({static_cast(2 * C * grid.y * sizeof(BatchNormParamType))}); + flag_tensor.Resize({static_cast(grid.x * sizeof(int))}); + + block_data_ptr = static_cast*>(ctx.template Alloc>(&block_data_tensor)); + flag_ptr = static_cast(ctx.template Alloc(&flag_tensor)); + } + + size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; if (compute_format == DataLayout::kNCHW) { - DispatchBNForwardTraining( - ctx, - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); + BNForwardTraining2D + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H 
* W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + block_data_ptr, + flag_ptr); + // DispatchBNForwardTraining( + // ctx, + // transformed_x.template data(), + // scale.template data>(), + // bias.template data>(), + // C, + // N, + // H * W * D, + // epsilon, + // this_factor, + // transformed_y.template data(), + // mean_out->template data>(), + // variance_out->template data>(), + // saved_mean->template data>(), + // saved_variance->template data>()); } else { - DispatchBNForwardTraining( - ctx, - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); + BNForwardTraining2D + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + block_data_ptr, + flag_ptr); + + // DispatchBNForwardTraining( + // ctx, + // transformed_x.template data(), + // scale.template data>(), + // bias.template data>(), + // C, + // N, + // H * W * D, + // epsilon, + // this_factor, + // transformed_y.template data(), + // mean_out->template data>(), + // variance_out->template data>(), + // saved_mean->template data>(), + // saved_variance->template data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) From 570dc551bb3958269b6ea7d65309d57a6e2c20ab Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Mon, 6 Jun 2022 10:13:28 +0800 Subject: [PATCH 08/70] refine the global space init --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 14778a89a4657..0cfffbb81f1ba 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1109,11 +1109,13 @@ void BatchNormKernel(const Context &ctx, BatchNormParamType* block_data_ptr = nullptr; int* flag_ptr = nullptr; if(grid.y > 1) { - block_data_tensor.Resize({static_cast(2 * C * grid.y * sizeof(BatchNormParamType))}); - flag_tensor.Resize({static_cast(grid.x * sizeof(int))}); + block_data_tensor = phi::Empty, Context>(ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); - block_data_ptr = static_cast*>(ctx.template Alloc>(&block_data_tensor)); - flag_ptr = static_cast(ctx.template Alloc(&flag_tensor)); + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); } size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; From aaca04a6bc9f8a671488b8f2fd99a95c6303fb34 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Tue, 7 Jun 2022 14:49:24 +0800 Subject: [PATCH 09/70] impl 2d kernel --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 549 ++++++++++++-------- 1 file changed, 339 insertions(+), 210 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 0cfffbb81f1ba..c726f31de232d 100644 --- 
a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -141,7 +141,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } - template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( const T *x, @@ -181,8 +180,8 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( // thread-local iterative computation for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; BatchNormParamType x_i = static_cast>(x[index]); BatchNormParamType delta = (x_i - local_mean); local_count++; @@ -191,11 +190,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( } // warp sum - for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -214,11 +219,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( local_var_n = warp_shared_count[threadIdx.x]; } - for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -251,20 +262,21 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( } template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double 
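[Note] The two butterfly loops reformatted above merge per-thread Welford states (count, mean, M2) first across a warp and then across warps. A self-contained device-side sketch of the warp-level step, assuming a full 32-lane warp; the function name and float precision are illustrative, not the kernel's actual types.

    // Each lane enters with its own (count, mean, m2); after the loop every lane
    // holds the statistics of the whole warp (pairwise Chan et al. combination).
    __device__ inline void WarpMergeWelford(int &count, float &mean, float &m2) {
      const unsigned kFullMask = 0xffffffffu;
    #pragma unroll
      for (int b = 0; b < 5; ++b) {                     // 5 = log2(32 lanes)
        float o_mean = __shfl_xor_sync(kFullMask, mean, 1 << b, 32);
        float o_m2   = __shfl_xor_sync(kFullMask, m2,   1 << b, 32);
        int   o_cnt  = __shfl_xor_sync(kFullMask, count, 1 << b, 32);
        float factor = 1.0f / static_cast<float>(max(1, count + o_cnt));
        float delta  = mean - o_mean;
        m2    += o_m2 + delta * delta * count * o_cnt * factor;
        mean   = (count * mean + o_cnt * o_mean) * factor;
        count += o_cnt;
      }
    }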
exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { +static __global__ + LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { int outer_size = C; int inner_size = N * HxW; __shared__ BatchNormParamType mean_val; @@ -288,28 +300,30 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; int tmp_local_count[PARALLEL_LOADS]; - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { tmp_local_mean[k] = static_cast>(0); tmp_local_var_n[k] = static_cast>(0); tmp_local_count[k] = 0; } // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; j += PARALLEL_LOADS * blockDim.x) { + for (int j = threadIdx.x; j < inner_size; + j += PARALLEL_LOADS * blockDim.x) { BatchNormParamType tmp_local_x[PARALLEL_LOADS]; BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; BatchNormParamType valid[PARALLEL_LOADS]; auto offset = j; - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { - if(offset < inner_size) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { + if (offset < inner_size) { const int index = layout == phi::DataLayout::kNCHW - ? (offset / HxW * C + i) * HxW + offset % HxW - : offset * outer_size + i; + ? (offset / HxW * C + i) * HxW + offset % HxW + : offset * outer_size + i; tmp_local_x[k] = static_cast>(x[index]); tmp_local_count[k]++; - tmp_local_count_inv[k] = static_cast>(1) / tmp_local_count[k]; + tmp_local_count_inv[k] = + static_cast>(1) / tmp_local_count[k]; valid[k] = static_cast>(1); } else { tmp_local_x[k] = static_cast>(0); @@ -319,20 +333,27 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel offset += blockDim.x; } - #pragma unroll - for(int k = 0; k < PARALLEL_LOADS; k++) { +#pragma unroll + for (int k = 0; k < PARALLEL_LOADS; k++) { BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); tmp_local_mean[k] += delta * tmp_local_count_inv[k]; - tmp_local_var_n[k] += delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; + tmp_local_var_n[k] += + delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; } } - #pragma unroll - for(int k = 1; k < PARALLEL_LOADS; k++) { - BatchNormParamType factor = 1.0 / static_cast(max(1, tmp_local_count[0]+tmp_local_count[k])); +#pragma unroll + for (int k = 1; k < PARALLEL_LOADS; k++) { + BatchNormParamType factor = + 1.0 / + static_cast(max(1, tmp_local_count[0] + tmp_local_count[k])); BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); - tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + tmp_local_count[k] * tmp_local_mean[k]) * factor; - tmp_local_var_n[0] += (tmp_local_var_n[k] + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); + tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + + tmp_local_count[k] * tmp_local_mean[k]) * + factor; + tmp_local_var_n[0] += + (tmp_local_var_n[k] + + delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); tmp_local_count[0] += tmp_local_count[k]; } @@ -341,11 +362,17 @@ 
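[Note] For reference, the combination applied both to the PARALLEL_LOADS accumulators above and to the warp/block reduction is the standard pairwise update for partial statistics (n, \mu, M_2), written here in LaTeX:

    n_{ab} = n_a + n_b, \qquad \delta = \mu_b - \mu_a,
    \mu_{ab} = \frac{n_a \mu_a + n_b \mu_b}{n_{ab}}, \qquad
    M_{2,ab} = M_{2,a} + M_{2,b} + \delta^2 \frac{n_a n_b}{n_{ab}},
    \sigma^2 = \frac{M_2}{n}.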
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel int local_count = tmp_local_count[0]; // warp sum - for(int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -364,11 +391,17 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel local_var_n = warp_shared_count[threadIdx.x]; } - for(int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = 1.0 / static_cast(max(1, local_count+o_count)); - local_var_n += (__shfl_xor_sync(0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + (local_mean - o_mean) * (local_mean - o_mean) * local_count * o_count * factor); + for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { + BatchNormParamType o_mean = + __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); + int o_count = + __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); + BatchNormParamType factor = + 1.0 / static_cast(max(1, local_count + o_count)); + local_var_n += (__shfl_xor_sync( + 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + + (local_mean - o_mean) * (local_mean - o_mean) * + local_count * o_count * factor); local_mean = (local_count * local_mean + o_count * o_mean) * factor; local_count += o_count; } @@ -401,7 +434,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel } template -static __global__ void BNForwardTraining2D( +static __global__ void BNForwardTraining2DComputeStatistic( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -411,32 +444,30 @@ static __global__ void BNForwardTraining2D( const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, BatchNormParamType *save_mean, BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, BatchNormParamType *block_data_ptr, int *flag_ptr) { int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(double)) char smem_buf[]; - - BatchNormParamType* mean_val = reinterpret_cast*>(smem_buf); - BatchNormParamType* variance_val = reinterpret_cast*>(&smem_buf[blockDim.x]); - BatchNormParamType* inv_var_val = reinterpret_cast*>(&smem_buf[2*blockDim.x]); - 
__shared__ BatchNormParamType smem_sum[BlockDim]; __shared__ BatchNormParamType smem_square_sum[BlockDim]; int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { BatchNormParamType x_sum = static_cast>(0); BatchNormParamType x_square_sum = static_cast>(0); - for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { const int index = layout == phi::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW : j * outer_size + i; @@ -447,9 +478,9 @@ static __global__ void BNForwardTraining2D( // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; - #pragma unroll - for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset*2) { +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; smem_square_sum[tid] = x_square_sum; } @@ -462,8 +493,9 @@ static __global__ void BNForwardTraining2D( } if (gridDim.y > 1) { - volatile BatchNormParamType* staging_sum = block_data_ptr; - volatile BatchNormParamType* staging_square_sum = &block_data_ptr[C*gridDim.y]; + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.y]; // write block data to global memory if (threadIdx.y == 0) { staging_sum[i + blockIdx.y * C] = x_sum; @@ -478,7 +510,7 @@ static __global__ void BNForwardTraining2D( // mark block done if (threadIdx.x == 0 && threadIdx.y == 0) { int old = atomicAdd(&flag_ptr[blockIdx.x], 1); - is_last_block_done = (old == (gridDim.y-1)); + is_last_block_done = (old == (gridDim.y - 1)); } __syncthreads(); @@ -488,15 +520,15 @@ static __global__ void BNForwardTraining2D( x_square_sum = static_cast>(0); // thread sum for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { - x_sum += staging_sum[i+y*C]; - x_square_sum += staging_square_sum[i+y*C]; + x_sum += staging_sum[i + y * C]; + x_square_sum += staging_square_sum[i + y * C]; } // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; - #pragma unroll - for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset*2) { +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; smem_square_sum[tid] = x_square_sum; } @@ -509,46 +541,90 @@ static __global__ void BNForwardTraining2D( } // final compute - if(threadIdx.y == 0) { - mean_val[threadIdx.x] = x_sum / inner_size; - variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; - inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); - + if (threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + if (save_mean && save_inv_variance) { - save_mean[i] = mean_val[threadIdx.x]; - save_inv_variance[i] = inv_var_val[threadIdx.x]; + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; } - mean[i] = (1 - exponentialAverageFactor) * 
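[Note] The gridDim.y > 1 path above is the classic two-stage grid reduction: each block writes its partial sums to a global staging buffer, makes the write visible with __threadfence(), bumps a zero-initialized counter with atomicAdd, and only the block that observes old == gridDim.y - 1 folds the partials. A standalone sketch of that pattern for a single summed value (blockDim.x assumed to be 256 and a power of two; names illustrative):

    __global__ void GridSum(const float *in, int n, float *staging /* gridDim.x */,
                            int *flag /* 1 int, zeroed */, float *out) {
      __shared__ float smem[256];                       // assumes blockDim.x == 256
      float partial = 0.f;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += gridDim.x * blockDim.x) {
        partial += in[i];
      }
      smem[threadIdx.x] = partial;                      // intra-block tree reduction
      __syncthreads();
      for (int off = blockDim.x / 2; off > 0; off >>= 1) {
        if (threadIdx.x < off) smem[threadIdx.x] += smem[threadIdx.x + off];
        __syncthreads();
      }
      if (threadIdx.x == 0) staging[blockIdx.x] = smem[0];
      __threadfence();                                  // publish before signaling
      __shared__ bool is_last;
      if (threadIdx.x == 0) {
        int old = atomicAdd(flag, 1);                   // count arrived blocks
        is_last = (old == gridDim.x - 1);               // true in the final arriver
      }
      __syncthreads();
      if (is_last && threadIdx.x == 0) {                // last block folds partials
        float total = 0.f;
        for (int b = 0; b < gridDim.x; ++b) total += staging[b];
        *out = total;
      }
    }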
mean_val[threadIdx.x] + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + - exponentialAverageFactor * variance[i]; + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; } } } else { - if(blockIdx.y == 0 && threadIdx.y == 0) { - mean_val[threadIdx.x] = x_sum / inner_size; - variance_val[threadIdx.x] = x_square_sum / inner_size - mean_val[threadIdx.x] * mean_val[threadIdx.x]; - inv_var_val[threadIdx.x] = 1 / sqrt(variance_val[threadIdx.x] + epsilon); + if (blockIdx.y == 0 && threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); if (save_mean && save_inv_variance) { - save_mean[i] = mean_val[threadIdx.x]; - save_inv_variance[i] = inv_var_val[threadIdx.x]; + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; } - mean[i] = (1 - exponentialAverageFactor) * mean_val[threadIdx.x] + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val[threadIdx.x] + - exponentialAverageFactor * variance[i]; + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; } } + } +} + +template +static __global__ void BNForwardTraining2DUpdateOutput( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(double)) char smem_buf[]; + + BatchNormParamType *smem_mean = + reinterpret_cast *>(smem_buf); + BatchNormParamType *smem_inv_var = + reinterpret_cast *>(&smem_buf[blockDim.x]); + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + if (threadIdx.y == 0) { + smem_mean[threadIdx.x] = compute_mean[i]; + smem_inv_var[threadIdx.x] = compute_inv_var[i]; + } __syncthreads(); - for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += blockDim.x) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { const int index = layout == phi::DataLayout::kNCHW ? 
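[Note] The per-channel "final compute" above reduces to the usual moment formulas. With S_1 = sum of x, S_2 = sum of x^2 over n = N*HxW elements, momentum factor f (exponentialAverageFactor), and affine parameters gamma/beta, in LaTeX:

    \mu = S_1 / n, \qquad \sigma^2 = S_2 / n - \mu^2, \qquad
    \hat\sigma^{-1} = 1 / \sqrt{\sigma^2 + \epsilon},
    \text{running\_mean} \leftarrow (1 - f)\,\mu + f\,\text{running\_mean}, \qquad
    \text{running\_var} \leftarrow (1 - f)\,\sigma^2 + f\,\text{running\_var},
    y = \gamma\,(x - \mu)\,\hat\sigma^{-1} + \beta.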
(j / HxW * C + i) * HxW + j % HxW : j * outer_size + i; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val[threadIdx.x]; - y[index] = scale[i] * x_sub_mean * inv_var_val[threadIdx.x] + bias[i]; + static_cast>(x[index]) - smem_mean[threadIdx.x]; + y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; } } } @@ -569,7 +645,8 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( BatchNormParamType *save_mean, BatchNormParamType *save_inv_variance) { extern __shared__ __align__(sizeof(double)) char smem_buf[]; - BatchNormParamType* x_buf = reinterpret_cast*>(smem_buf); + BatchNormParamType *x_buf = + reinterpret_cast *>(smem_buf); int outer_size = C; int inner_size = N * HxW; @@ -646,7 +723,8 @@ inline bool TryDispatchBNForwardTrainingSMem( cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_conf, BNForwardTrainingSMem, - block_size, smem); + block_size, + smem); } if (max_active_blocks_conf <= 0) { return false; @@ -654,46 +732,85 @@ inline bool TryDispatchBNForwardTrainingSMem( const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block_size, 1); const int grid = std::min(C, max_blocks); - BNForwardTrainingSMem<<>>( - x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + BNForwardTrainingSMem<<>>( + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); return true; } template inline void DispatchBNForwardTraining( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { + const Context &ctx, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *mean, + BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { if ((N * HxW) <= 1024) { - // TODO: impl register-cache version + // TODO(yaozihang): impl register-cache version return; } else { bool dispatch_smem_impl_success = false; { - dispatch_smem_impl_success = TryDispatchBNForwardTrainingSMem( - ctx, x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + dispatch_smem_impl_success = + TryDispatchBNForwardTrainingSMem( + ctx, + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); } if (!dispatch_smem_impl_success) { const int block = 512; const int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(C, max_blocks); - return BNForwardTraining<<>>( - x, scale, bias, C, N, HxW, epsilon, exponentialAverageFactor, - y, mean, variance, save_mean, save_inv_variance); + return BNForwardTraining<<>>( + x, + scale, + bias, + C, + N, + HxW, + epsilon, + exponentialAverageFactor, + y, + mean, + variance, + save_mean, + save_inv_variance); } } } @@ -1016,10 +1133,9 @@ void BatchNormKernel(const Context &ctx, const int max_blocks = std::max(max_threads / block, 1); const int grid = 
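[Note] TryDispatchBNForwardTrainingSMem above only takes the shared-memory path when the occupancy API reports that at least one block can be resident with the requested dynamic shared memory; otherwise DispatchBNForwardTraining falls back to the plain kernel. A reduced sketch of just that gate; the kernel names in the usage comment are placeholders, not Paddle kernels.

    #include <cuda_runtime.h>

    template <typename KernelT>
    static bool FitsOnDevice(KernelT kernel, int block_size, size_t smem_bytes) {
      int max_active_blocks = 0;
      cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &max_active_blocks, kernel, block_size, smem_bytes);
      return err == cudaSuccess && max_active_blocks > 0;
    }

    // Illustrative usage:
    //   size_t smem = sizeof(float) * inner_size;          // whole row cached
    //   if (FitsOnDevice(MyKernelSMem<float>, 512, smem))
    //     MyKernelSMem<float><<<grid, 512, smem>>>(...);
    //   else
    //     MyKernelPlain<float><<<grid, 512>>>(...);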
std::min(C, max_blocks); if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, - block, - DataLayout::kNCHW><<>>( + BNForwardTraining<<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -1034,10 +1150,9 @@ void BatchNormKernel(const Context &ctx, saved_mean->template data>(), saved_variance->template data>()); } else { - BNForwardTraining< - T, - block, - DataLayout::kNHWC><<>>( + BNForwardTraining<<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), @@ -1082,21 +1197,25 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - //const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); + // const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); const bool use_native_kernel = true; - if(use_native_kernel) { + if (use_native_kernel) { dim3 block; dim3 grid; const int block_size = 512; // init block&grid config int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); - int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); if (block_x * block_y != block_size) { - block_x = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); } int grid_x = (C + block_x - 1) / block_x; - int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); block.x = block_x; block.y = block_y; @@ -1106,10 +1225,16 @@ void BatchNormKernel(const Context &ctx, // init intermediate storage DenseTensor block_data_tensor; DenseTensor flag_tensor; - BatchNormParamType* block_data_ptr = nullptr; - int* flag_ptr = nullptr; - if(grid.y > 1) { - block_data_tensor = phi::Empty, Context>(ctx, {2 * C * grid.y}); + DenseTensor compute_mean_tensor = + phi::Empty, Context>(ctx, {C}); + DenseTensor compute_inv_var_tensor = + phi::Empty, Context>(ctx, {C}); + + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + if (grid.y > 1) { + block_data_tensor = + phi::Empty, Context>(ctx, {2 * C * grid.y}); flag_tensor = phi::Empty(ctx, {grid.x}); block_data_ptr = block_data_tensor.data>(); @@ -1118,74 +1243,78 @@ void BatchNormKernel(const Context &ctx, set_zero(ctx, &flag_tensor, static_cast(0)); } - size_t smem_size = 3 * sizeof(BatchNormParamType) * block.x; if (compute_format == DataLayout::kNCHW) { - BNForwardTraining2D - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>(), - block_data_ptr, - flag_ptr); - // DispatchBNForwardTraining( - // ctx, - // transformed_x.template data(), - // scale.template data>(), - // bias.template data>(), - // C, - // N, - // H * W * D, - // epsilon, - // this_factor, - // transformed_y.template data(), - // mean_out->template data>(), - // variance_out->template data>(), - // saved_mean->template data>(), - // saved_variance->template data>()); + BNForwardTraining2DComputeStatistic< + T, + block_size, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + 
bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DUpdateOutput< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { - BNForwardTraining2D - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - epsilon, - this_factor, - transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>(), - block_data_ptr, - flag_ptr); - - // DispatchBNForwardTraining( - // ctx, - // transformed_x.template data(), - // scale.template data>(), - // bias.template data>(), - // C, - // N, - // H * W * D, - // epsilon, - // this_factor, - // transformed_y.template data(), - // mean_out->template data>(), - // variance_out->template data>(), - // saved_mean->template data>(), - // saved_variance->template data>()); + BNForwardTraining2DComputeStatistic< + T, + block_size, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DUpdateOutput< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) @@ -1284,7 +1413,7 @@ void BatchNormKernel(const Context &ctx, epsilon, ctx.template Alloc>(saved_mean), ctx.template Alloc>(saved_variance))); -#endif // CUDNN_VERSION_MIN(7, 4, 1) +#endif // CUDNN_VERSION_MIN(7, 4, 1) } #endif } From 74b792b38bf9e6110b619473f49613f958fa0eb1 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 15:55:40 +0800 Subject: [PATCH 10/70] rm wellford --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 292 -------------------- 1 file changed, 292 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 4b263fd983171..5cf81367e4cf7 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,298 +138,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellford( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - 
int outer_size = C; - int inner_size = N * HxW; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - constexpr int THREADS_PER_WARP = 32; - constexpr int THREADS_BITS_PER_WARP = 5; - - constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; - const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); - - __shared__ int warp_shared_count[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType local_mean = static_cast>(0); - BatchNormParamType local_var_n = static_cast>(0); - int local_count = 0; - - // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - BatchNormParamType delta = (x_i - local_mean); - local_count++; - local_mean += delta / local_count; - local_var_n += delta * (x_i - local_mean); - } - - // warp sum - for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x % THREADS_PER_WARP == 0) { - warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; - warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; - warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; - } - __syncthreads(); - - // block sum - if (threadIdx.x < WARP_PER_BLOCK) { - local_count = warp_shared_count[threadIdx.x]; - local_mean = warp_shared_count[threadIdx.x]; - local_var_n = warp_shared_count[threadIdx.x]; - } - - for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x == 0) { - mean_val = local_mean; - variance_val = local_var_n / local_count; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -static __global__ -LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingWellfordParallel( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - constexpr int PARALLEL_LOADS = 4; - - constexpr int THREADS_PER_WARP = 32; - constexpr int THREADS_BITS_PER_WARP = 5; - - constexpr int WARP_PER_BLOCK = BlockDim / THREADS_PER_WARP; - const int WARP_BITS_PER_BLOCK = (31 - __clz(WARP_PER_BLOCK)); - - __shared__ int warp_shared_count[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_mean[WARP_PER_BLOCK]; - __shared__ BatchNormParamType warp_shared_var_n[WARP_PER_BLOCK]; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType tmp_local_mean[PARALLEL_LOADS]; - BatchNormParamType tmp_local_var_n[PARALLEL_LOADS]; - int tmp_local_count[PARALLEL_LOADS]; - -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - tmp_local_mean[k] = static_cast>(0); - tmp_local_var_n[k] = static_cast>(0); - tmp_local_count[k] = 0; - } - - // thread-local iterative computation - for (int j = threadIdx.x; j < inner_size; - j += PARALLEL_LOADS * blockDim.x) { - BatchNormParamType tmp_local_x[PARALLEL_LOADS]; - BatchNormParamType tmp_local_count_inv[PARALLEL_LOADS]; - BatchNormParamType valid[PARALLEL_LOADS]; - auto offset = j; -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - if (offset < inner_size) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(offset / HxW * C + i) * HxW + offset % HxW - : offset * outer_size + i; - tmp_local_x[k] = static_cast>(x[index]); - tmp_local_count[k]++; - tmp_local_count_inv[k] = - static_cast>(1) / tmp_local_count[k]; - valid[k] = static_cast>(1); - } else { - tmp_local_x[k] = static_cast>(0); - tmp_local_count_inv[k] = static_cast>(0); - valid[k] = static_cast>(0); - } - offset += blockDim.x; - } - -#pragma unroll - for (int k = 0; k < PARALLEL_LOADS; k++) { - BatchNormParamType delta = (tmp_local_x[k] - tmp_local_mean[k]); - tmp_local_mean[k] += delta * tmp_local_count_inv[k]; - tmp_local_var_n[k] += - delta * (tmp_local_x[k] - tmp_local_mean[k]) * valid[k]; - } - } - -#pragma unroll - for (int k = 1; k < PARALLEL_LOADS; k++) { - BatchNormParamType factor = - 1.0 / - static_cast(max(1, tmp_local_count[0] + tmp_local_count[k])); - BatchNormParamType delta = (tmp_local_mean[0] - tmp_local_mean[k]); - tmp_local_mean[0] = (tmp_local_count[0] * tmp_local_mean[0] + - tmp_local_count[k] * tmp_local_mean[k]) * - factor; - tmp_local_var_n[0] += - (tmp_local_var_n[k] + - delta * delta * tmp_local_count[0] * tmp_local_count[k] * factor); - tmp_local_count[0] += tmp_local_count[k]; - } - - BatchNormParamType local_mean = tmp_local_mean[0]; - BatchNormParamType local_var_n = tmp_local_var_n[0]; - int local_count = tmp_local_count[0]; - - // warp sum - for (int b_i = 0; b_i < THREADS_BITS_PER_WARP; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x % THREADS_PER_WARP == 0) { - warp_shared_count[threadIdx.x / THREADS_PER_WARP] = local_count; - warp_shared_mean[threadIdx.x / THREADS_PER_WARP] = local_mean; - warp_shared_var_n[threadIdx.x / THREADS_PER_WARP] = local_var_n; - } - __syncthreads(); - - // block sum - if (threadIdx.x < WARP_PER_BLOCK) { - local_count = warp_shared_count[threadIdx.x]; - local_mean = warp_shared_count[threadIdx.x]; - local_var_n = warp_shared_count[threadIdx.x]; - } - - for (int b_i = 0; b_i < WARP_BITS_PER_BLOCK; b_i++) { - BatchNormParamType o_mean = - __shfl_xor_sync(0xffffffff, local_mean, 1 << b_i, THREADS_PER_WARP); - int o_count = - __shfl_xor_sync(0xffffffff, local_count, 1 << b_i, THREADS_PER_WARP); - BatchNormParamType factor = - 1.0 / static_cast(max(1, local_count + o_count)); - local_var_n += (__shfl_xor_sync( - 0xffffffff, local_var_n, 1 << b_i, THREADS_PER_WARP) + - (local_mean - o_mean) * (local_mean - o_mean) * - local_count * o_count * factor); - local_mean = (local_count * local_mean + o_count * o_mean) * factor; - local_count += o_count; - } - - if (threadIdx.x == 0) { - mean_val = local_mean; - variance_val = local_var_n / local_count; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < 
inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - template static __global__ void BNForwardTraining2DComputeStatistic( const T *x, From a0bd5b67697ea2f0d6981db58bb5f4d3cbca967a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 13:49:46 +0800 Subject: [PATCH 11/70] fix backward --- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 194 +++++++++++------- 1 file changed, 115 insertions(+), 79 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 6de239182c15b..b23b119342d68 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -542,70 +542,60 @@ void BatchNormGradRawKernel(const Context &ctx, // This branch calls CUDNN APIs if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - DenseTensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_tensor.Resize({static_cast(workspace_size)}); - workspace_ptr = - static_cast(ctx.template Alloc(&workspace_tensor)); - - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/ctx.template Alloc(&transformed_d_x), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale.template data>(), - /*bnBiasData=*/nullptr, - /*dBnScaleData=*/ - ctx.template Alloc>(d_scale), - /*dBnBiasData=*/ctx.template Alloc>(d_bias), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/ - const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { #ifdef PADDLE_WITH_HIP + if (compute_format == DataLayout::kNCHW) { + BNBackward + <<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + ctx.template Alloc>(d_scale), + ctx.template Alloc>(d_bias)); + } else { + BNBackward + <<>>( + 
transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + ctx.template Alloc>(d_scale), + ctx.template Alloc>(d_bias)); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); +#else + // CUDNN PER_ACTIVATION mode only support small batch size + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const bool use_native_kernel = + (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward <<>>( @@ -637,22 +627,67 @@ void BatchNormGradRawKernel(const Context &ctx, ctx.template Alloc>(d_scale), ctx.template Alloc>(d_bias)); } + } else { +#if CUDNN_VERSION_MIN(7, 4, 1) + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); + workspace_tensor.Resize({static_cast(workspace_size)}); + workspace_ptr = + static_cast(ctx.template Alloc(&workspace_tensor)); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/transformed_x.template data(), + /*yDesc=*/nullptr, + /*yData=*/nullptr, + /*dyDesc=*/data_desc_, + /*dyData=*/transformed_d_y.template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/ctx.template Alloc(&transformed_d_x), + 
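[Note] On the restructured backward branch above: cuDNN's PER_ACTIVATION batch-norm backward is bounded in the batch dimension (the 131070 threshold), so for rank-2 inputs with a very large N the native BNBackward kernel is preferred, while the workspace-based cudnnBatchNormalizationBackwardEx path (when CUDNN >= 7.4.1) is kept otherwise. A condensed sketch of the selection logic only; the two Launch* functions are placeholders for the launches shown in the diff, not real APIs.

    static void LaunchNativeBNBackward()  { /* BNBackward<T, block, layout><<<...>>> */ }
    static void LaunchCudnnBNBackwardEx() { /* cudnnBatchNormalizationBackwardEx path */ }

    static void DispatchBNBackward(int x_rank, long long N, bool with_cudnn) {
      // cuDNN PER_ACTIVATION mode only supports a bounded batch size.
      const long long kCudnnPerActivationThreshold = 131070;
      const bool use_native_kernel =
          (x_rank == 2 && N >= kCudnnPerActivationThreshold);
      if (use_native_kernel || !with_cudnn) {
        LaunchNativeBNBackward();
      } else {
        LaunchCudnnBNBackwardEx();
      }
    }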
/*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale.template data>(), + /*bnBiasData=*/nullptr, + /*dBnScaleData=*/ + ctx.template Alloc>(d_scale), + /*dBnBiasData=*/ + ctx.template Alloc>(d_bias), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/nullptr, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/ + const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cudnnBatchNormalizationBackward( @@ -675,8 +710,9 @@ void BatchNormGradRawKernel(const Context &ctx, epsilon, saved_mean_data, saved_var_data)); -#endif +#endif // CUDNN_VERSION_MIN(7, 4, 1) } +#endif if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW) { From 2433ebf80f0e9c019b2d1ce587d6b965476b0638 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 14:39:53 +0800 Subject: [PATCH 12/70] add unit test for batchnorm1d --- .../tests/unittests/test_batch_norm_op_v2.py | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 9db95f094a7e3..cfd5d5f7c9bd0 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -110,11 +110,43 @@ def compute_v2(x): y.backward() return y.numpy(), x1.gradient() - x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) - self.assertTrue(np.allclose(g1, g2)) - self.assertTrue(np.allclose(y1, y2)) + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_v1(x) + y2, g2 = compute_v2(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + + def test_eager_api_1d(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [200000, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_v2(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_v1(x) + y2, g2 = compute_v2(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) def test_dygraph(self): places = [fluid.CPUPlace()] From 90c27a653c5a3a9d0295eed48446d2a58bb667f8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sat, 11 Jun 2022 21:40:24 +0800 Subject: [PATCH 13/70] fix bug --- paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 5 +++-- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b23b119342d68..1e93803866e3f 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -593,8 +593,9 @@ void BatchNormGradRawKernel(const Context &ctx, #else // CUDNN PER_ACTIVATION mode only support small batch size const size_t 
CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + // const bool use_native_kernel = + // (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + const bool use_native_kernel = true; if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 5cf81367e4cf7..90e9b45d3ab4e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -304,12 +304,13 @@ static __global__ void BNForwardTraining2DUpdateOutput( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(double)) char smem_buf[]; + extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; BatchNormParamType *smem_mean = reinterpret_cast *>(smem_buf); BatchNormParamType *smem_inv_var = - reinterpret_cast *>(&smem_buf[blockDim.x]); + reinterpret_cast *>( + smem_buf + blockDim.x * sizeof(BatchNormParamType)); int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; From 91d83e559cf7fc7c0b8607b2d3a8a7b628ca1c89 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Sun, 12 Jun 2022 00:12:56 +0800 Subject: [PATCH 14/70] impl channel last 2d --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 296 +++++++++++++++++--- 1 file changed, 260 insertions(+), 36 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 90e9b45d3ab4e..07539c221d06d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -139,7 +139,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } template -static __global__ void BNForwardTraining2DComputeStatistic( +static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -291,7 +291,7 @@ static __global__ void BNForwardTraining2DComputeStatistic( } template -static __global__ void BNForwardTraining2DUpdateOutput( +static __global__ void BNForwardTraining2DChannelLastWriteRes( const T *x, const BatchNormParamType *scale, const BatchNormParamType *bias, @@ -335,6 +335,203 @@ static __global__ void BNForwardTraining2DUpdateOutput( } } +template +static __global__ void BNForwardTraining2DCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = layout == 
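[Note] On the "fix bug" change to the write-result kernel above: smem_buf is declared as char[], so the old expression &smem_buf[blockDim.x] advanced by blockDim.x bytes rather than blockDim.x parameter-type elements, making the two shared arrays overlap; the fix offsets by blockDim.x * sizeof(BatchNormParamType) and raises the alignment to that type. A minimal sketch of carving one dynamic shared buffer into two typed arrays (kernel and contents are illustrative):

    // Launch with 2 * blockDim.x * sizeof(ParamT) bytes of dynamic shared memory.
    template <typename ParamT>
    __global__ void TwoSmemArraysKernel(const ParamT *in, ParamT *out, int n) {
      extern __shared__ __align__(sizeof(ParamT)) char smem_buf[];
      ParamT *smem_a = reinterpret_cast<ParamT *>(smem_buf);
      // Offset in *bytes*: blockDim.x elements of ParamT, not blockDim.x chars.
      ParamT *smem_b =
          reinterpret_cast<ParamT *>(smem_buf + blockDim.x * sizeof(ParamT));
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        smem_a[threadIdx.x] = in[i];           // first array: the value itself
        smem_b[threadIdx.x] = in[i] * in[i];   // second array: its square
      }
      __syncthreads();
      if (i < n) out[i] = smem_a[threadIdx.x] + smem_b[threadIdx.x];
    }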
phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // horizonal block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + if (gridDim.x > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.x]; + // write block data to global memory + if (threadIdx.x == 0) { + staging_sum[i + blockIdx.x * C] = x_sum; + staging_square_sum[i + blockIdx.x * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.y], 1); + is_last_block_done = (old == (gridDim.x - 1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) { + x_sum += staging_sum[i + x * C]; + x_square_sum += staging_square_sum[i + x * C]; + } + + // vertical block sum + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.y) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + + // final compute + if (threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.x == 0 && threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static 
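[Note] In the NCHW variant introduced above the block axes are swapped relative to the channel-last kernel: each row of the block (one threadIdx.y value) owns one channel and reduces along threadIdx.x, so the intra-block reduction runs horizontally over blockDim.x. A trimmed device-side sketch of that horizontal step for a plain sum, assuming blockDim.x is a power of two and smem has blockDim.x * blockDim.y floats (names illustrative):

    __device__ inline float RowReduceSum(float v, float *smem) {
      const int tid = threadIdx.x + threadIdx.y * blockDim.x;  // row-major slot
      smem[tid] = v;
      __syncthreads();
      for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
          smem[tid] += smem[tid + offset];     // fold the right half of the row
        }
        __syncthreads();
      }
      return smem[threadIdx.y * blockDim.x];   // row total, same for all lanes
    }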
__global__ void BNForwardTraining2DWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; + + BatchNormParamType *smem_mean = + reinterpret_cast *>(smem_buf); + BatchNormParamType *smem_inv_var = + reinterpret_cast *>( + smem_buf + blockDim.y * sizeof(BatchNormParamType)); + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + if (threadIdx.x == 0) { + smem_mean[threadIdx.y] = compute_mean[i]; + smem_inv_var[threadIdx.y] = compute_inv_var[i]; + } + __syncthreads(); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - smem_mean[threadIdx.y]; + y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; + } + } +} + template static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( const T *x, @@ -900,25 +1097,7 @@ void BatchNormKernel(const Context &ctx, if (use_native_kernel) { dim3 block; dim3 grid; - const int block_size = 512; - // init block&grid config - int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); - int block_y = - std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), - block_size / block_x); - if (block_x * block_y != block_size) { - block_x = std::min(phi::funcs::details::GetLastPow2(C), - block_size / block_y); - } - int grid_x = (C + block_x - 1) / block_x; - int grid_y = - std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); - - block.x = block_x; - block.y = block_y; - grid.x = grid_x; - grid.y = grid_y; // init intermediate storage DenseTensor block_data_tensor; @@ -930,19 +1109,34 @@ void BatchNormKernel(const Context &ctx, BatchNormParamType *block_data_ptr = nullptr; int *flag_ptr = nullptr; - if (grid.y > 1) { - block_data_tensor = - phi::Empty, Context>(ctx, {2 * C * grid.y}); - flag_tensor = phi::Empty(ctx, {grid.x}); - - block_data_ptr = block_data_tensor.data>(); - flag_ptr = flag_tensor.data(); - funcs::SetConstant set_zero; - set_zero(ctx, &flag_tensor, static_cast(0)); - } - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining2DComputeStatistic + if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { + // init block&grid config + int block_x = std::min( + phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size); + int block_y = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_x); + + int grid_x = std::min( + (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); + int grid_y = (C + block_y - 1) / block_y; + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.x > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.x}); + flag_tensor = phi::Empty(ctx, {grid.y}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -962,8 +1156,8 @@ void 
BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DUpdateOutput + size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); + BNForwardTraining2DWriteRes <<>>( transformed_x.template data(), scale.template data>(), @@ -975,7 +1169,37 @@ void BatchNormKernel(const Context &ctx, compute_mean_tensor.data>(), compute_inv_var_tensor.data>()); } else { - BNForwardTraining2DComputeStatistic + // init block&grid config + int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = std::min( + (N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.y > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -996,7 +1220,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DUpdateOutput + BNForwardTraining2DChannelLastWriteRes <<>>( transformed_x.template data(), scale.template data>(), From 6871dbf6a0f606a798d22a62df075111667fe023 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Wed, 15 Jun 2022 11:11:22 +0800 Subject: [PATCH 15/70] refine --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 226 ++------------------ 1 file changed, 20 insertions(+), 206 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 07539c221d06d..abaca230d0ec1 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,7 +138,7 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } -template +template static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, const BatchNormParamType *scale, @@ -173,9 +173,7 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = j * outer_size + i; BatchNormParamType x_i = static_cast>(x[index]); x_sum += x_i; x_square_sum += x_i * x_i; @@ -290,7 +288,7 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } } -template +template static __global__ void BNForwardTraining2DChannelLastWriteRes( const T *x, const BatchNormParamType *scale, @@ -325,9 +323,7 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = j * outer_size + i; BatchNormParamType x_sub_mean = static_cast>(x[index]) - smem_mean[threadIdx.x]; y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; @@ -335,7 +331,7 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( } } -template +template static __global__ void BNForwardTraining2DCompStat( const T *x, const BatchNormParamType *scale, @@ -370,9 +366,7 @@ static __global__ void BNForwardTraining2DCompStat( for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_i = static_cast>(x[index]); x_sum += x_i; x_square_sum += x_i * x_i; @@ -487,7 +481,7 @@ static __global__ void BNForwardTraining2DCompStat( } } -template +template static __global__ void BNForwardTraining2DWriteRes( const T *x, const BatchNormParamType *scale, @@ -522,9 +516,7 @@ static __global__ void BNForwardTraining2DWriteRes( for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; + const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_sub_mean = static_cast>(x[index]) - smem_mean[threadIdx.y]; y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; @@ -532,188 +524,6 @@ static __global__ void BNForwardTraining2DWriteRes( } } -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTrainingSMem( - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - extern __shared__ __align__(sizeof(double)) char smem_buf[]; - BatchNormParamType *x_buf = - reinterpret_cast *>(smem_buf); - - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_buf[j] = x_i; - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == phi::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x_buf[j]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -inline bool TryDispatchBNForwardTrainingSMem( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - constexpr int block_size = 512; - const size_t smem = N * HxW * sizeof(BatchNormParamType); - int max_active_blocks_conf; - { - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf, - BNForwardTrainingSMem, - block_size, - smem); - } - if (max_active_blocks_conf <= 0) { - return false; - } - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block_size, 1); - const int grid = std::min(C, max_blocks); - BNForwardTrainingSMem - <<>>(x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - return true; -} - -template -inline void DispatchBNForwardTraining( - const Context &ctx, - const T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const int C, - const int N, - const int HxW, - const double epsilon, - double exponentialAverageFactor, - T *y, - BatchNormParamType *mean, - BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - if ((N * HxW) <= 1024) { - // TODO(yaozihang): impl register-cache version - return; - } else { - bool dispatch_smem_impl_success = false; - { - dispatch_smem_impl_success = - TryDispatchBNForwardTrainingSMem( - ctx, - x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - } - if (!dispatch_smem_impl_success) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - return BNForwardTraining - <<>>(x, - scale, - bias, - C, - N, - HxW, - epsilon, - exponentialAverageFactor, - y, - mean, - variance, - save_mean, - save_inv_variance); - } - } -} - template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -1112,11 +922,17 @@ void BatchNormKernel(const Context &ctx, if 
(x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { // init block&grid config - int block_x = std::min( - phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size); + int block_x = + std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size); int block_y = std::min(phi::funcs::details::GetLastPow2(C), block_size / block_x); + if (block_x * block_y != block_size) { + block_x = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_y); + } + int grid_x = std::min( (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); int grid_y = (C + block_y - 1) / block_y; @@ -1136,7 +952,7 @@ void BatchNormKernel(const Context &ctx, funcs::SetConstant set_zero; set_zero(ctx, &flag_tensor, static_cast(0)); } - BNForwardTraining2DCompStat + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -1157,7 +973,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DWriteRes + BNForwardTraining2DWriteRes <<>>( transformed_x.template data(), scale.template data>(), @@ -1197,9 +1013,7 @@ void BatchNormKernel(const Context &ctx, funcs::SetConstant set_zero; set_zero(ctx, &flag_tensor, static_cast(0)); } - BNForwardTraining2DChannelLastCompStat + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -1220,7 +1034,7 @@ void BatchNormKernel(const Context &ctx, flag_ptr); size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DChannelLastWriteRes + BNForwardTraining2DChannelLastWriteRes <<>>( transformed_x.template data(), scale.template data>(), From 0571ecc7ef662a4b718cd9dfe1cd0b0a4cf0b43a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Wed, 15 Jun 2022 16:01:14 +0800 Subject: [PATCH 16/70] fix memory thpt --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 82 +++++++++------------ 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index abaca230d0ec1..6d54e4193b007 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -302,31 +302,22 @@ static __global__ void BNForwardTraining2DChannelLastWriteRes( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; - - BatchNormParamType *smem_mean = - reinterpret_cast *>(smem_buf); - BatchNormParamType *smem_inv_var = - reinterpret_cast *>( - smem_buf + blockDim.x * sizeof(BatchNormParamType)); - int outer_loop_stride = gridDim.x * blockDim.x; int inner_loop_stride = gridDim.y * blockDim.y; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; i += outer_loop_stride) { - if (threadIdx.y == 0) { - smem_mean[threadIdx.x] = compute_mean[i]; - smem_inv_var[threadIdx.x] = compute_inv_var[i]; - } - __syncthreads(); + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; j += inner_loop_stride) { const int index = j * outer_size + i; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - smem_mean[threadIdx.x]; - y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.x] + bias[i]; + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; 
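For reference: after this change the write-back kernels read the per-channel mean/inv_var (and scale/bias) into registers once per channel instead of staging them in shared memory, then apply y = scale * (x - mean) * inv_var + bias element-wise. Below is a minimal CPU sketch of that step, assuming the statistics are already computed; the function and variable names are illustrative and not part of the patch.

#include <vector>

// Host-side reference of the write-back step. The per-channel values are
// read once and reused, matching the register-cached values in the kernels
// above. `layout_nchw` selects between the two index forms used for NCHW
// and channel-last data.
void BNWriteBackReference(const std::vector<float>& x,
                          const std::vector<float>& scale,
                          const std::vector<float>& bias,
                          const std::vector<float>& compute_mean,
                          const std::vector<float>& compute_inv_var,
                          int C, int N, int HxW, bool layout_nchw,
                          std::vector<float>* y) {
  const int inner_size = N * HxW;
  for (int c = 0; c < C; ++c) {
    const float mean_val = compute_mean[c];
    const float inv_var_val = compute_inv_var[c];
    const float scale_val = scale[c];
    const float bias_val = bias[c];
    for (int j = 0; j < inner_size; ++j) {
      // NCHW: (n * C + c) * HxW + s, with n = j / HxW and s = j % HxW.
      // NHWC: j * C + c.
      const int index = layout_nchw ? (j / HxW * C + c) * HxW + j % HxW
                                    : j * C + c;
      (*y)[index] = scale_val * (x[index] - mean_val) * inv_var_val + bias_val;
    }
  }
}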
} } } @@ -495,31 +486,22 @@ static __global__ void BNForwardTraining2DWriteRes( int outer_size = C; int inner_size = N * HxW; - extern __shared__ __align__(sizeof(BatchNormParamType)) char smem_buf[]; - - BatchNormParamType *smem_mean = - reinterpret_cast *>(smem_buf); - BatchNormParamType *smem_inv_var = - reinterpret_cast *>( - smem_buf + blockDim.y * sizeof(BatchNormParamType)); - int outer_loop_stride = gridDim.y * blockDim.y; int inner_loop_stride = gridDim.x * blockDim.x; for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; i += outer_loop_stride) { - if (threadIdx.x == 0) { - smem_mean[threadIdx.y] = compute_mean[i]; - smem_inv_var[threadIdx.y] = compute_inv_var[i]; - } - __syncthreads(); + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; j += inner_loop_stride) { const int index = (j / HxW * C + i) * HxW + j % HxW; BatchNormParamType x_sub_mean = - static_cast>(x[index]) - smem_mean[threadIdx.y]; - y[index] = scale[i] * x_sub_mean * smem_inv_var[threadIdx.y] + bias[i]; + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; } } } @@ -908,6 +890,8 @@ void BatchNormKernel(const Context &ctx, dim3 block; dim3 grid; const int block_size = 512; + const int MAX_GRID_SIZE = 128; + const int WARP_SIZE = 32; // init intermediate storage DenseTensor block_data_tensor; @@ -933,8 +917,9 @@ void BatchNormKernel(const Context &ctx, block_size / block_y); } - int grid_x = std::min( - (N * H * W * D + block_x * 16 - 1) / (block_x * 16), 128); + int grid_x = + std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16), + MAX_GRID_SIZE); int grid_y = (C + block_y - 1) / block_y; block.x = block_x; @@ -972,21 +957,20 @@ void BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.y * 2 * sizeof(BatchNormParamType); - BNForwardTraining2DWriteRes - <<>>( - transformed_x.template data(), - scale.template data>(), - bias.template data>(), - C, - N, - H * W * D, - transformed_y.template data(), - compute_mean_tensor.data>(), - compute_inv_var_tensor.data>()); + BNForwardTraining2DWriteRes<<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { // init block&grid config - int block_x = std::min(phi::funcs::details::GetLastPow2(C), 32); + int block_x = + std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), block_size / block_x); @@ -995,8 +979,9 @@ void BatchNormKernel(const Context &ctx, block_size / block_y); } int grid_x = (C + block_x - 1) / block_x; - int grid_y = std::min( - (N * H * W * D + block_y * 16 - 1) / (block_y * 16), 128); + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), + MAX_GRID_SIZE); block.x = block_x; block.y = block_y; @@ -1033,9 +1018,8 @@ void BatchNormKernel(const Context &ctx, block_data_ptr, flag_ptr); - size_t smem_size = block.x * 2 * sizeof(BatchNormParamType); BNForwardTraining2DChannelLastWriteRes - <<>>( + <<>>( transformed_x.template data(), scale.template data>(), bias.template data>(), From 3fc54ada9a14d577234023064f772c82dcc01c76 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 16 Jun 2022 05:47:24 +0000 
Subject: [PATCH 17/70] opt gather --- .../phi/kernels/sparse/gpu/convolution.cu.h | 19 +++++++++++++------ .../sparse/gpu/convolution_grad_kernel.cu | 8 ++++---- .../kernels/sparse/gpu/convolution_kernel.cu | 4 ++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..74d3108806a2a 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { namespace sparse { @@ -46,18 +47,24 @@ using Dims4D = phi::funcs::sparse::Dims4D; * index_size: the size of indices * slice_size: slice size corresponding to each index, here is the channel size **/ -template +template __global__ void GatherKernel(const T* params, const IndexT* indices, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size*slice_size/VecSize, int64_t) { + const int vec_slice_size = slice_size / VecSize; + int indices_i = i / vec_slice_size; + int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i; - *(output + i) = *(params + params_i); + int64_t params_i = gather_i * slice_size + slice_i*VecSize; + //*(output + i) = *(params + params_i); + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + LoadT params_vec; + phi::Load(params + params_i, ¶ms_vec); + phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d83d064418eec..0b89d4a2dc269 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -138,8 +138,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<<<>>(x.non_zero_elements().data(), @@ -149,8 +149,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, in_channels); config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel + dev_ctx, rulebook_len * out_channels/sizeof(T), 1); + GatherKernel <<(0.0f)); auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<<<>>(x.non_zero_elements().data(), From 804ba033849e230dbce5ee827a68e4e578c46485 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 15:24:16 +0800 Subject: [PATCH 18/70] fix threshold --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6d54e4193b007..9c21b0e716165 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -884,8 +884,8 @@ void BatchNormKernel(const Context &ctx, // 
static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - // const bool use_native_kernel = (x_dims.size() == 2 && N >= 131070); - const bool use_native_kernel = true; + const bool use_native_kernel = ((x_dims.size() == 2 && N >= 131070) || + (x_dims.size() == 3 && N >= 880801)); if (use_native_kernel) { dim3 block; dim3 grid; From 48c634469b8194117a5ed388cdf45c17297e1aa7 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 22:56:40 +0800 Subject: [PATCH 19/70] fix backward threshold --- paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 9 +++++---- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 1e93803866e3f..0f028f42a956c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -591,11 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx, // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - // CUDNN PER_ACTIVATION mode only support small batch size + // CUDNN only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - // const bool use_native_kernel = - // (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); - const bool use_native_kernel = true; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 9c21b0e716165..c70bfc3d2c27a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -884,8 +884,11 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - const bool use_native_kernel = ((x_dims.size() == 2 && N >= 131070) || - (x_dims.size() == 3 && N >= 880801)); + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { dim3 block; dim3 grid; From 6785f6f20076d9bf1b48768fec5efaf5f960658a Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 23:39:10 +0800 Subject: [PATCH 20/70] refine unit test --- .../tests/unittests/test_batch_norm_op_v2.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index cfd5d5f7c9bd0..193f43c7eb9f0 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -82,50 +82,50 @@ def error3d(): self.assertRaises(ValueError, error2d_dataformat) self.assertRaises(ValueError, error3d_dataformat) - def test_eager_api(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - shape = [4, 10, 4, 4] + def test_large_batch(self): - def compute_v1(x): - with 
fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) - #bn = paddle.nn.BatchNorm2D(shape[1]) + def compute_baseline(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_1d(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) y.backward() return y.numpy(), x1.gradient() - def compute_v2(x): - with fluid.dygraph.guard(p): - with _test_eager_guard(): - print("v2") - bn = paddle.nn.BatchNorm2D(shape[1]) - x1 = paddle.to_tensor(x) - x1.stop_gradient = False - y = bn(x1) - y.backward() - return y.numpy(), x1.gradient() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [200000, 4] x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) self.assertTrue(np.allclose(g1, g2)) self.assertTrue(np.allclose(y1, y2)) - def test_eager_api_1d(self): + def test_eager_api(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: - shape = [200000, 4] + shape = [4, 10, 4, 4] def compute_v1(x): with fluid.dygraph.guard(p): bn = fluid.dygraph.BatchNorm(shape[1]) + #bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -135,7 +135,8 @@ def compute_v1(x): def compute_v2(x): with fluid.dygraph.guard(p): with _test_eager_guard(): - bn = paddle.nn.BatchNorm1D(shape[1]) + print("v2") + bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) From e46ef54a4bd6a9e53660e165950415a22ecc8bb8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Thu, 16 Jun 2022 23:42:53 +0800 Subject: [PATCH 21/70] refine test --- .../paddle/fluid/tests/unittests/test_batch_norm_op_v2.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 193f43c7eb9f0..7aa3b8cddf80c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -107,8 +107,16 @@ def compute_1d(x): if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: + # [N, C] shape = [200000, 4] + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + # [N, C, L] + shape = [1000000, 4, 4] x = np.random.randn(*shape).astype("float32") y1, g1 = compute_baseline(x) y2, g2 = compute_1d(x) From 938cde3131225bbdab9a8eb28d6cd83331c58830 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Fri, 17 Jun 2022 14:32:07 +0800 Subject: [PATCH 22/70] delete pragma unroll --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index c70bfc3d2c27a..a26bba041912b 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -229,7 +229,6 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( // vertical block sum int tid = 
threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { if (threadIdx.y < offset * 2) { smem_sum[tid] = x_sum; @@ -365,7 +364,6 @@ static __global__ void BNForwardTraining2DCompStat( // horizonal block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (threadIdx.x < offset * 2) { smem_sum[tid] = x_sum; @@ -413,7 +411,6 @@ static __global__ void BNForwardTraining2DCompStat( // vertical block sum int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { if (threadIdx.x < offset * 2) { smem_sum[tid] = x_sum; From a24f2aa8d0681e8a940aa686bb5f80f70fb71671 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 20 Jun 2022 09:20:32 +0000 Subject: [PATCH 23/70] opt gather and scatter --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 31 +++++--- .../kernels/sparse/gpu/coalesced_kernel.cu | 36 ++++++--- .../phi/kernels/sparse/gpu/convolution.cu.h | 1 - .../sparse/gpu/convolution_grad_kernel.cu | 73 +++++++++++------ .../kernels/sparse/gpu/convolution_kernel.cu | 78 +++++++++++++------ 5 files changed, 150 insertions(+), 69 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index b9568f1df716d..48b12f8a1b6de 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +#define VecBytes 16 namespace phi { namespace funcs { @@ -28,33 +31,37 @@ namespace sparse { * channels: the output channel size * out: the outputs **/ -template +template __global__ void ScatterKernel(const T* input, const int* unique_value, const int* out_index, const int non_zero_num, const int rulebook_len, const int channels, - T* out, - const bool subm = false) { + T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } + StoreT sums={static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; + LoadT vec_in; + phi::Load(input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for(int k = 0; k < VecSize; k++){ + sums[k] += vec_in[k]; + } } - out[indices_i * channels + channels_i] = sum; + phi::Store(sums, out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 7d9e566916add..44ecb4365a9c1 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -132,16 +132,32 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } // 5. scatter the values - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + const int VecSize = VecBytes / sizeof(T); + if(stride % VecSize == 0){ + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + }else{ + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } // 6. convert index to coordinate Dim const_dims; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 74d3108806a2a..7f0d4814b75d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -59,7 +59,6 @@ __global__ void GatherKernel(const T* params, int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; int64_t params_i = gather_i * slice_size + slice_i*VecSize; - //*(output + i) = *(params + params_i); using LoadT = phi::AlignedVector; using StoreT = phi::AlignedVector; LoadT params_vec; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 0b89d4a2dc269..ebf2bf6cae896 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -23,6 +23,7 @@ limitations under the License. 
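For reference: the vectorized gather moves VecSize = VecBytes / sizeof(T) contiguous elements of a channel slice per thread (16 bytes, i.e. 4 floats) with one aligned load/store, and the call sites fall back to VecSize = 1 when the channel count is not divisible by VecSize. Below is a minimal host-side sketch of the same index arithmetic, assuming slice_size % VecSize == 0; the helper name is illustrative and not part of the patch.

#include <cstddef>
#include <vector>

// Host-side reference of the vectorized gather: output[i] takes the row
// params[indices[i]], copied VecSize elements at a time. On the GPU each
// inner copy is a single aligned vector load/store via phi::AlignedVector.
template <typename T, typename IndexT, int VecSize>
void GatherRowsReference(const std::vector<T>& params,        // [num_rows, slice_size]
                         const std::vector<IndexT>& indices,  // [index_size]
                         std::vector<T>* output,              // [index_size, slice_size]
                         std::size_t slice_size) {
  const std::size_t vec_slice_size = slice_size / VecSize;
  for (std::size_t i = 0; i < indices.size() * vec_slice_size; ++i) {
    const std::size_t indices_i = i / vec_slice_size;
    const std::size_t slice_i = i - indices_i * vec_slice_size;
    const std::size_t params_i =
        static_cast<std::size_t>(indices[indices_i]) * slice_size +
        slice_i * VecSize;
    for (int k = 0; k < VecSize; ++k) {
      (*output)[i * VecSize + k] = params[params_i + k];
    }
  }
}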
*/ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -137,28 +138,56 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels/sizeof(T), 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if(in_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels/sizeof(T), 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if(out_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -203,7 +232,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); phi::funcs::ScatterCUDAKernel<< set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels/sizeof(T), 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if(in_channels % VecSize == 0){ + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels / VecSize, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + }else{ + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -155,7 +169,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 4. 
scatter if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); phi::funcs::ScatterCUDAKernel <<nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + if(out_channels % VecSize == 0){ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + }else{ + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 64be38b56a5c931cfeff710aa69bc6336472fe46 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 20 Jun 2022 10:56:50 +0000 Subject: [PATCH 24/70] opt conv --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 34 +++++--- .../kernels/sparse/gpu/coalesced_kernel.cu | 39 ++++++--- .../phi/kernels/sparse/gpu/convolution.cu.h | 18 +++-- .../sparse/gpu/convolution_grad_kernel.cu | 75 ++++++++++++----- .../kernels/sparse/gpu/convolution_kernel.cu | 80 +++++++++++++------ 5 files changed, 172 insertions(+), 74 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index b9568f1df716d..cd89c916db577 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +#define VecBytes 16 namespace phi { namespace funcs { @@ -28,33 +31,40 @@ namespace sparse { * channels: the output channel size * out: the outputs **/ -template +template __global__ void ScatterKernel(const T* input, const int* unique_value, const int* out_index, const int non_zero_num, const int rulebook_len, const int channels, - T* out, - const bool subm = false) { + T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; int start = unique_value[indices_i]; int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } + StoreT sums = {static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } } - out[indices_i * channels + channels_i] = sum; + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 7d9e566916add..60d90a18d4633 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -132,16 +132,35 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } // 5. scatter the values - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } // 6. convert index to coordinate Dim const_dims; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..a08c7931bb4f4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -24,6 +24,7 @@ limitations under the License. 
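For reference: the scatter now accumulates VecSize channels at a time per thread, and the `subm` in-place accumulation branch was removed from this kernel (the submanifold path zero-initializes the output and scatters separately). Below is a scalar host-side sketch of the reduction it performs, with illustrative names only.

#include <vector>

// Host-side reference of the scatter-accumulate step: for each output row r,
// the rows input[out_index[j]] with j in [unique_value[r], end) are summed
// channel-wise, where end is unique_value[r + 1] (or rulebook_len for the
// last row).
template <typename T>
void ScatterSumReference(const std::vector<T>& input,          // [rulebook_len, channels]
                         const std::vector<int>& unique_value, // [non_zero_num]
                         const std::vector<int>& out_index,    // [rulebook_len]
                         int non_zero_num, int rulebook_len, int channels,
                         std::vector<T>* out) {                // [non_zero_num, channels]
  for (int r = 0; r < non_zero_num; ++r) {
    const int start = unique_value[r];
    const int end =
        (r == non_zero_num - 1) ? rulebook_len : unique_value[r + 1];
    for (int c = 0; c < channels; ++c) {
      T sum = static_cast<T>(0);
      for (int j = start; j < end; ++j) {
        sum += input[out_index[j] * channels + c];
      }
      (*out)[r * channels + c] = sum;
    }
  }
}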
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" @@ -46,18 +47,23 @@ using Dims4D = phi::funcs::sparse::Dims4D; * index_size: the size of indices * slice_size: slice size corresponding to each index, here is the channel size **/ -template +template __global__ void GatherKernel(const T* params, const IndexT* indices, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size / VecSize, int64_t) { + const int vec_slice_size = slice_size / VecSize; + int indices_i = i / vec_slice_size; + int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i; - *(output + i) = *(params + params_i); + int64_t params_i = gather_i * slice_size + slice_i * VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + LoadT params_vec; + phi::Load(params + params_i, ¶ms_vec); + phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d83d064418eec..d91c93fde66fb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -137,28 +138,58 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -203,7 +234,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); phi::funcs::ScatterCUDAKernel<< set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + const int VecSize = VecBytes / sizeof(T); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, n * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } else { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -155,7 +171,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 4. 
scatter if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); phi::funcs::ScatterCUDAKernel <<nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 26ca7ab0c54dd03cbc1e85c095a6839d23cfff54 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 06:58:05 +0000 Subject: [PATCH 25/70] fix batch csr --- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 2 +- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 2 +- .../tests/unittests/test_sparse_utils_op.py | 78 ++++++++----------- 3 files changed, 36 insertions(+), 46 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 1cd3086d5f74c..8bf0104ef0baf 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -198,7 +198,7 @@ void SparseCooToCsrCPUKernel(const CPUContext& dev_ctx, const auto& coo_values = x.non_zero_elements(); const IntT* batchs_ptr = coo_indices.data(); const IntT* coo_rows_data = - batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num; + x_dims.size() == 2 ? batchs_ptr : batchs_ptr + non_zero_num; const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1ed4ebd23db87..bcc979dcbe51d 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -370,7 +370,7 @@ void SparseCooToCsrGPUKernel(const GPUContext& dev_ctx, const auto& coo_values = x.non_zero_elements(); const IntT* batchs_ptr = coo_indices.data(); const IntT* coo_rows_data = - batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num; + x_dims.size() == 2 ? 
batchs_ptr : batchs_ptr + non_zero_num; const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 6cc1d9cf96cae..a12425b69299e 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -318,50 +318,40 @@ def test_sparse_coo_tensor_sorted(self): def test_batch_csr(self): with _test_eager_guard(): - shape = [3, 3, 3] - - def verify(x, crows, cols, values): - x = paddle.to_tensor(x) - csr = x.to_sparse_csr() - assert np.allclose(crows, csr.crows().numpy()) - assert np.allclose(cols, csr.cols().numpy()) - assert np.allclose(values, csr.values().numpy()) - - dense = csr.to_dense() - assert np.allclose(x.numpy(), dense.numpy()) - - x = [ - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - ] - crows = [[0, 1, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) - - x = [ - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - ] - crows = [[0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2, 3]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) - - x = [ - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[1.0, 0, 0], [0, 2.0, 0], [0, 0, 3.0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0]], - ] - crows = [[0, 1, 2, 3, 0, 1, 2, 3, 0, 0, 0, 0]] - cols = [0, 1, 2, 0, 1, 2] - values = [1.0, 2.0, 3.0, 1.0, 2.0, 3.0] - - verify(x, crows, cols, values) + + def verify(dense_x): + sparse_x = dense_x.to_sparse_csr() + out = sparse_x.to_dense() + assert np.allclose(out.numpy(), dense_x.numpy()) + + shape = np.random.randint(low=1, high=10, size=3) + shape = list(shape) + dense_x = paddle.randn(shape) + dense_x = paddle.nn.functional.dropout(dense_x, p=0.5) + verify(dense_x) + + #test batchs=1 + shape[0] = 1 + dense_x = paddle.randn(shape) + dense_x = paddle.nn.functional.dropout(dense_x, p=0.5) + verify(dense_x) + + shape = np.random.randint(low=2, high=10, size=3) + shape = list(shape) + dense_x = paddle.randn(shape) + #set the 0th batch to zero + dense_x[0] = 0 + verify(dense_x) + + dense_x = paddle.randn(shape) + #set the 1th batch to zero + dense_x[1] = 0 + verify(dense_x) + + dense_x = paddle.randn(shape) + #set the 2th batch to zero + dense_x[2] = 0 + verify(dense_x) class TestCooError(unittest.TestCase): From 11011c0b63288e6c8168f997bfae1caa9f2d3ef1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 07:00:11 +0000 Subject: [PATCH 26/70] remove the unused file --- .../unittests/test_sparse_middle_extractor.py | 324 ------------------ .../tests/unittests/test_sparse_mnist.py | 126 ------- 2 files changed, 450 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py deleted file mode 100644 index ae52b4a413336..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py +++ /dev/null @@ -1,324 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.sparse as sparse -from 
paddle.fluid.framework import _test_eager_guard -import time -import numpy as np -import torch -import spconv.pytorch as spconv -import inspect - -class MiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='MiddleExtractor'): - super(MiddleExtractor, self).__init__() - self.name = name - if not use_norm: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - else: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - def forward(self, x): - return self.middle_conv(x) - - -def get_pos_to_kw_map(func): - pos_to_kw = {} - fsig = inspect.signature(func) - pos = 0 - for name, info in fsig.parameters.items(): - if info.kind is info.POSITIONAL_OR_KEYWORD: - pos_to_kw[pos] = name - pos += 1 - return pos_to_kw - -def change_default_args(**kwargs): - def layer_wrapper(layer_class): - class DefaultArgLayer(layer_class): - def __init__(self, *args, **kw): - pos_to_kw = get_pos_to_kw_map(layer_class.__init__) - kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} - for key, val in kwargs.items(): - if key not in kw and kw_to_pos[key] > len(args): - kw[key] = val - super().__init__(*args, **kw) - - return DefaultArgLayer - - return layer_wrapper - -class Empty(torch.nn.Module): - def __init__(self, *args, **kwargs): - super(Empty, self).__init__() - - def forward(self, *args, **kwargs): - if len(args) == 1: - return args[0] - elif len(args) == 0: - return None - return args - -class SpconvMiddleExtractor(torch.nn.Module): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SpconvMiddleExtractor'): - super(SpconvMiddleExtractor, self).__init__() - if use_norm: - BatchNorm1d = change_default_args( - eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) - Linear = change_default_args(bias=False)(nn.Linear) - else: - BatchNorm1d = Empty - Linear = change_default_args(bias=True)(nn.Linear) - - middle_layers = [] - - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d1: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - middle_layers.append( - 
spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - - if use_norm: - #middle_layers.append( - # BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - - # assert len(num_filters_down2) > 0 - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d2: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - if use_norm: - #middle_layers.append( - #BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) - middle_layers.append(spconv.ToDense()) - self.middle_conv = spconv.SparseSequential(*middle_layers) - - def forward(self, x): - out = self.middle_conv(x) - return out - -class SparseMiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SparseMiddleExtractor'): - super(SparseMiddleExtractor, self).__init__() - self.name = name - - middle_layers = [] - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d1: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d2: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - self.middle_conv = nn.Sequential(*middle_layers) - - def forward(self, x): - sparse_out = self.middle_conv(x) - #return sparse_out - return sparse_out.to_dense() - - -def test(): - paddle.seed(0) - with _test_eager_guard(): - in_channels = 128 - # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. 
- shape = [20, 40, 100] - batch_size = 1 - sparsity = 0.95 - - full_shape = [batch_size] + shape + [in_channels] - print(full_shape) - - total_elements = np.prod(shape) - nnz = int(total_elements * (1-sparsity)) - print("nnz=", nnz) - - #product indices - indices = [] - for i in range(4): - indices.append(paddle.randint(0, full_shape[i], [1, nnz])) - - indices = paddle.concat(indices) - #product values - values = paddle.randn((nnz, in_channels)) - - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) - - dense_x = sparse_x.to_dense() - - #spconv - device = torch.device("cuda") - torch_x = torch.tensor(dense_x.numpy(), device=device) - - spconv_x = spconv.SparseConvTensor.from_dense(torch_x) - - #whether to use batch_norm - use_norm = True - - dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) - sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - layer_nums = len(sparse_model.middle_conv) - block_size = 3 if use_norm else 2 - layer_nums = int(layer_nums / block_size) - - for i in range(0, layer_nums): - weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) - if use_norm: - bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) - - print(dense_model) - print(sparse_model) - print(spconv_model) - paddle.device.cuda.synchronize() - - #warm up - dense_x.stop_gradient=True - out1 = dense_model(dense_x) - paddle.device.cuda.synchronize() - sparse_x.stop_gradient=True - out2 = sparse_model(sparse_x) - paddle.device.cuda.synchronize() - spconv_x.features.required_grad=False - out3 = spconv_model(spconv_x) - torch.cuda.synchronize(device) - #warm up - - t0 = time.time() - #padde dense - dense_x.stop_gradient=False - out1 = dense_model(dense_x) - out1.backward(out1) - paddle.device.cuda.synchronize() - t1 = time.time() - - #padde sparse - sparse_x.stop_gradient=False - out2 = sparse_model(sparse_x) - out2.backward(out2) - paddle.device.cuda.synchronize() - t2 = time.time() - - #spconv - spconv_x.features.required_grad=True - spconv_x.features.requires_grad_() - out3 = spconv_model(spconv_x) - out3.backward(out3) - torch.cuda.synchronize(device) - t3 = time.time() - - # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 - # Note 3. 只跑前向,sparse的耗时和spconv接近,稀疏度越高sparse的性能越好,当前方式测试前向+反向,spconv的耗时很高, 原因未知 - print("dense time: ", t1 - t0) - print("sparse time: ", t2 - t1) - print("spconv time: ", t3 - t2) - - # Note 4. 
paddle和torch的BN存在误差,测试shape=(4000, 64)的随机输入,单层BN前向误差在1e-6, 反向误差在1e-4 - #verify the forward calculation result - assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) - - #verify the backward calculation result - assert np.allclose(spconv_x.features.grad.cpu().numpy(), - sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) - -test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py deleted file mode 100644 index 3589dc83090f3..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py +++ /dev/null @@ -1,126 +0,0 @@ -import paddle -from paddle.vision.transforms import Compose, Normalize, ToTensor -from paddle.fluid.framework import _test_eager_guard -import time - -paddle.disable_static() -#transform = Compose([Normalize(mean=[127.5], -# std=[127.5], -# data_format='CHW')]) -transform = Compose([ToTensor()]) -# 使用transform对数据集做归一化 -print('download training data and load training data') -train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) -test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) -print('load finished') - -import numpy as np -#import matplotlib.pyplot as plt -train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] -train_data0 = train_data0.reshape([28,28]) -#plt.figure(figsize=(2,2)) -#plt.imshow(train_data0, cmap=plt.cm.binary) -print('train_data0 label is: ' + str(train_label_0)) - - -import paddle -import paddle.nn.functional as F -class SparseLeNet(paddle.nn.Layer): - def __init__(self): - super(SparseLeNet, self).__init__() - #self.bn = paddle.sparse.BatchNorm(1) - self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) - self.relu1 = paddle.sparse.ReLU() - self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) - self.relu2 = paddle.sparse.ReLU() - self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - - self.fc1 = paddle.nn.Linear(16*5*5, 120) - self.fc2 = paddle.nn.Linear(120, 84) - self.fc3 = paddle.nn.Linear(84, 10) - - def forward(self, x): - #x = self.bn(x) - x = self.conv1(x) - x = self.relu1(x) - x = self.pool1(x) - x = self.conv2(x) - x = self.relu2(x) - x = self.pool2(x) - x = x.to_dense() - - x = paddle.flatten(x, start_axis=1, stop_axis=-1) - x = self.fc1(x) - x = paddle.nn.functional.relu(x) - x = self.fc2(x) - x = paddle.nn.functional.relu(x) - x = self.fc3(x) - return x - -import paddle.nn.functional as F -train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) -# 加载训练集 batch_size 设为 64 -# sparse 训练 - -def prepare_data(x_data): - x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) - x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) - return x_data - -def sparse_train(model): - model.train() - epochs = 2 - optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) - # 用Adam作为优化函数 - for epoch in range(epochs): - for batch_id, data in enumerate(train_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - x_data.stop_gradient=False - predicts = model(x_data) - loss = F.cross_entropy(predicts, y_data) - # 计算损失 - acc = paddle.metric.accuracy(predicts, 
y_data) - loss.backward() - if batch_id % 300 == 0: - print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) - optim.step() - optim.clear_grad() - -test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) -# 加载测试数据集 -def test(model): - model.eval() - batch_size = 64 - for batch_id, data in enumerate(test_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - predicts = model(x_data) - # 获取预测结果 - loss = F.cross_entropy(predicts, y_data) - acc = paddle.metric.accuracy(predicts, y_data) - if batch_id % 20 == 0: - print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) - -with _test_eager_guard(): - sparse_model = SparseLeNet() - print(sparse_model) - - t0 = time.time() - sparse_train(sparse_model) - t1 = time.time() - print("spare time:", t1-t0) - test(sparse_model) - #x = paddle.randn((1, 1,28,28,1)) - #x.stop_gradient=False - #sparse_x = x.to_sparse_coo(4) - #print("sparse_x values shape:", sparse_x.values().shape) - #out = sparse_model(sparse_x) - #out.backward(out) - #print("end") - From de91d400ca163421bb4a71d7c4703fa873f62bbc Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 08:34:15 +0000 Subject: [PATCH 27/70] opt SparseMaskCopyKernel --- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 48 ++- .../unittests/test_sparse_middle_extractor.py | 324 ------------------ .../tests/unittests/test_sparse_mnist.py | 126 ------- 3 files changed, 35 insertions(+), 463 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_mnist.py diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index cbbdc122f616f..0b05433de83db 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { @@ -120,7 +121,7 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template +template __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const IntT* mask_indexs, const IntT* bound_out, @@ -129,10 +130,15 @@ __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const int64_t stride, T* out_values) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; const IntT j = bound_out[i]; if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride; k++) { - out_values[i * stride + k] = x_values[j * stride + k]; + for (int k = 0; k < stride / VecSize; k++) { + // out_values[i * stride + k] = x_values[j * stride + k]; + LoadT vec_x; + phi::Load(x_values + j * stride + k * VecSize, &vec_x); + phi::Store(vec_x, out_values + i * stride + k * VecSize); } } } @@ -230,16 +236,32 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, const int64_t stride = x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; - SparseMaskCopyKernel<<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), - mask_indexs.numel(), - stride, - out_ptr); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + SparseMaskCopyKernel + <<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); + } else { + SparseMaskCopyKernel + <<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + stride, + out_ptr); + } } template diff --git a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py b/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py deleted file mode 100644 index ae52b4a413336..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_middle_extractor.py +++ /dev/null @@ -1,324 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.sparse as sparse -from paddle.fluid.framework import _test_eager_guard -import time -import numpy as np -import torch -import spconv.pytorch as spconv -import inspect - -class MiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='MiddleExtractor'): - super(MiddleExtractor, self).__init__() - self.name = name - if not use_norm: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - #nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - else: - self.middle_conv = paddle.nn.Sequential( - #nn.Pad3D(1), - nn.Conv3D(num_input_features, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D([1, 1, 1, 1, 0, 0]), - nn.Conv3D(64, 64, 3, stride=(1, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - #nn.Pad3D(1), - nn.Conv3D(64, 64, 3, stride=(2, 1, 1), data_format='NDHWC'), - nn.BatchNorm3D(64, epsilon=1e-3, momentum=0.001, data_format='NDHWC'), - nn.ReLU(), - ) - def forward(self, x): - return self.middle_conv(x) - - -def get_pos_to_kw_map(func): - pos_to_kw = {} - fsig = inspect.signature(func) - pos = 0 - for name, info in fsig.parameters.items(): - if info.kind is info.POSITIONAL_OR_KEYWORD: - pos_to_kw[pos] = name - pos += 1 - return pos_to_kw - -def change_default_args(**kwargs): - def layer_wrapper(layer_class): - class DefaultArgLayer(layer_class): - def __init__(self, *args, **kw): - pos_to_kw = get_pos_to_kw_map(layer_class.__init__) - kw_to_pos = {kw: pos for pos, kw in pos_to_kw.items()} - for key, val in kwargs.items(): - if key not in kw and kw_to_pos[key] > len(args): - kw[key] = val - super().__init__(*args, **kw) - - return DefaultArgLayer - - return layer_wrapper - -class Empty(torch.nn.Module): - def __init__(self, *args, **kwargs): - super(Empty, self).__init__() - - def forward(self, *args, **kwargs): - if len(args) == 1: - return args[0] - elif len(args) == 0: - return None - return 
args - -class SpconvMiddleExtractor(torch.nn.Module): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SpconvMiddleExtractor'): - super(SpconvMiddleExtractor, self).__init__() - if use_norm: - BatchNorm1d = change_default_args( - eps=1e-3, momentum=0.01)(torch.nn.BatchNorm1d) - Linear = change_default_args(bias=False)(nn.Linear) - else: - BatchNorm1d = Empty - Linear = change_default_args(bias=True)(nn.Linear) - - middle_layers = [] - - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - - for i, o in filters_pairs_d1: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - - if use_norm: - #middle_layers.append( - # BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - - - # assert len(num_filters_down2) > 0 - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] - for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d2: - middle_layers.append(spconv.SubMConv3d(i, o, 3, bias=False)) - if use_norm: - #middle_layers.append(BatchNorm1d(o)) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - middle_layers.append( - spconv.SparseConv3d( - num_filters[-1], - num_filters[-1], (3, 1, 1), (2, 1, 1), - bias=False)) - if use_norm: - #middle_layers.append( - #BatchNorm1d(num_filters[-1])) - middle_layers.append(torch.nn.BatchNorm1d(o, eps=1e-3, momentum=0.01)) - middle_layers.append(torch.nn.ReLU()) - #middle_layers.append(scn.SparseToDense(3, num_filters[-1])) - middle_layers.append(spconv.ToDense()) - self.middle_conv = spconv.SparseSequential(*middle_layers) - - def forward(self, x): - out = self.middle_conv(x) - return out - -class SparseMiddleExtractor(paddle.nn.Layer): - def __init__(self, - #output_shape, - use_norm=True, - num_input_features=128, - num_filters_down1=[64], - num_filters_down2=[64, 64], - name='SparseMiddleExtractor'): - super(SparseMiddleExtractor, self).__init__() - self.name = name - - middle_layers = [] - num_filters = [num_input_features] + num_filters_down1 - filters_pairs_d1 = [[num_filters[i], num_filters[i + 1]] for i in range(len(num_filters) - 1)] - for i, o in filters_pairs_d1: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - - if len(num_filters_down1) == 0: - num_filters = [num_filters[-1]] + num_filters_down2 - else: - num_filters = [num_filters_down1[-1]] + num_filters_down2 - - filters_pairs_d2 = [[num_filters[i], num_filters[i + 1]] for i in 
range(len(num_filters) - 1)] - - for i, o in filters_pairs_d2: - middle_layers.append(sparse.SubmConv3D(i, o, 3, bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(o, epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - middle_layers.append(sparse.Conv3D(num_filters[-1], num_filters[-1], (3, 1, 1), (2, 1, 1), bias_attr=False)) - if use_norm: - middle_layers.append(sparse.BatchNorm(num_filters[-1], epsilon=1e-3, momentum=0.01)) - middle_layers.append(sparse.ReLU()) - - self.middle_conv = nn.Sequential(*middle_layers) - - def forward(self, x): - sparse_out = self.middle_conv(x) - #return sparse_out - return sparse_out.to_dense() - - -def test(): - paddle.seed(0) - with _test_eager_guard(): - in_channels = 128 - # Note: 1. paddle的BatchNorm1D的输入shape不能太大,否则报CUDNN_STATUS_NOT_SUPPORTED. - shape = [20, 40, 100] - batch_size = 1 - sparsity = 0.95 - - full_shape = [batch_size] + shape + [in_channels] - print(full_shape) - - total_elements = np.prod(shape) - nnz = int(total_elements * (1-sparsity)) - print("nnz=", nnz) - - #product indices - indices = [] - for i in range(4): - indices.append(paddle.randint(0, full_shape[i], [1, nnz])) - - indices = paddle.concat(indices) - #product values - values = paddle.randn((nnz, in_channels)) - - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, shape=full_shape) - - dense_x = sparse_x.to_dense() - - #spconv - device = torch.device("cuda") - torch_x = torch.tensor(dense_x.numpy(), device=device) - - spconv_x = spconv.SparseConvTensor.from_dense(torch_x) - - #whether to use batch_norm - use_norm = True - - dense_model = MiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - spconv_model = SpconvMiddleExtractor(use_norm=use_norm, num_input_features=in_channels).to(device) - sparse_model = SparseMiddleExtractor(use_norm=use_norm, num_input_features=in_channels) - layer_nums = len(sparse_model.middle_conv) - block_size = 3 if use_norm else 2 - layer_nums = int(layer_nums / block_size) - - for i in range(0, layer_nums): - weight = paddle.to_tensor(spconv_model.middle_conv[i * block_size].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size].weight.set_value(paddle.transpose(paddle.to_tensor(weight), [1,2,3,4,0])) - if use_norm: - bn_weight = paddle.to_tensor(spconv_model.middle_conv[i*block_size + 1].weight.detach().cpu().numpy()) - sparse_model.middle_conv[i * block_size + 1].weight.set_value(bn_weight) - - print(dense_model) - print(sparse_model) - print(spconv_model) - paddle.device.cuda.synchronize() - - #warm up - dense_x.stop_gradient=True - out1 = dense_model(dense_x) - paddle.device.cuda.synchronize() - sparse_x.stop_gradient=True - out2 = sparse_model(sparse_x) - paddle.device.cuda.synchronize() - spconv_x.features.required_grad=False - out3 = spconv_model(spconv_x) - torch.cuda.synchronize(device) - #warm up - - t0 = time.time() - #padde dense - dense_x.stop_gradient=False - out1 = dense_model(dense_x) - out1.backward(out1) - paddle.device.cuda.synchronize() - t1 = time.time() - - #padde sparse - sparse_x.stop_gradient=False - out2 = sparse_model(sparse_x) - out2.backward(out2) - paddle.device.cuda.synchronize() - t2 = time.time() - - #spconv - spconv_x.features.required_grad=True - spconv_x.features.requires_grad_() - out3 = spconv_model(spconv_x) - out3.backward(out3) - torch.cuda.synchronize(device) - t3 = time.time() - - # Note 2. sparse的BatchNorm底层是使用paddle.nn.BatchNorm1D对values进行bn计算,测试发现BatchNorm1D的性能比BatchNorm3D差,因此use_norm=True的情况,需要更高的稀疏度才能比dense的快 - # Note 3. 
只跑前向,sparse的耗时和spconv接近,稀疏度越高sparse的性能越好,当前方式测试前向+反向,spconv的耗时很高, 原因未知 - print("dense time: ", t1 - t0) - print("sparse time: ", t2 - t1) - print("spconv time: ", t3 - t2) - - # Note 4. paddle和torch的BN存在误差,测试shape=(4000, 64)的随机输入,单层BN前向误差在1e-6, 反向误差在1e-4 - #verify the forward calculation result - assert np.allclose(paddle.transpose(out2, [0, 4, 1, 2, 3]).numpy(), out3.detach().cpu().numpy(), atol=1e-4, rtol=1e-4) - - #verify the backward calculation result - assert np.allclose(spconv_x.features.grad.cpu().numpy(), - sparse_x.grad.values().numpy(), atol=1e-3, rtol=1e-3) - -test() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py b/python/paddle/fluid/tests/unittests/test_sparse_mnist.py deleted file mode 100644 index 3589dc83090f3..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_mnist.py +++ /dev/null @@ -1,126 +0,0 @@ -import paddle -from paddle.vision.transforms import Compose, Normalize, ToTensor -from paddle.fluid.framework import _test_eager_guard -import time - -paddle.disable_static() -#transform = Compose([Normalize(mean=[127.5], -# std=[127.5], -# data_format='CHW')]) -transform = Compose([ToTensor()]) -# 使用transform对数据集做归一化 -print('download training data and load training data') -train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) -test_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) -print('load finished') - -import numpy as np -#import matplotlib.pyplot as plt -train_data0, train_label_0 = train_dataset[0][0],train_dataset[0][1] -train_data0 = train_data0.reshape([28,28]) -#plt.figure(figsize=(2,2)) -#plt.imshow(train_data0, cmap=plt.cm.binary) -print('train_data0 label is: ' + str(train_label_0)) - - -import paddle -import paddle.nn.functional as F -class SparseLeNet(paddle.nn.Layer): - def __init__(self): - super(SparseLeNet, self).__init__() - #self.bn = paddle.sparse.BatchNorm(1) - self.conv1 = paddle.sparse.Conv3D(in_channels=1, out_channels=6, kernel_size=[1, 5, 5], stride=[1, 1, 1], padding=[0, 2, 2]) - self.relu1 = paddle.sparse.ReLU() - self.pool1 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - self.conv2 = paddle.sparse.Conv3D(in_channels=6, out_channels=16, kernel_size=[1, 5, 5], stride=[1, 1, 1]) - self.relu2 = paddle.sparse.ReLU() - self.pool2 = paddle.sparse.MaxPool3D(kernel_size=[1, 2, 2], stride=[1, 2, 2]) - - self.fc1 = paddle.nn.Linear(16*5*5, 120) - self.fc2 = paddle.nn.Linear(120, 84) - self.fc3 = paddle.nn.Linear(84, 10) - - def forward(self, x): - #x = self.bn(x) - x = self.conv1(x) - x = self.relu1(x) - x = self.pool1(x) - x = self.conv2(x) - x = self.relu2(x) - x = self.pool2(x) - x = x.to_dense() - - x = paddle.flatten(x, start_axis=1, stop_axis=-1) - x = self.fc1(x) - x = paddle.nn.functional.relu(x) - x = self.fc2(x) - x = paddle.nn.functional.relu(x) - x = self.fc3(x) - return x - -import paddle.nn.functional as F -train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True) -# 加载训练集 batch_size 设为 64 -# sparse 训练 - -def prepare_data(x_data): - x_data = paddle.transpose(x_data, perm=[0, 2, 3, 1]) - x_data = paddle.reshape(x_data, [x_data.shape[0], 1, x_data.shape[1], x_data.shape[2], x_data.shape[3]]) - return x_data - -def sparse_train(model): - model.train() - epochs = 2 - optim = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()) - # 用Adam作为优化函数 - for epoch in range(epochs): - for batch_id, data in enumerate(train_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - 
x_data = x_data.to_sparse_coo(4) - x_data.stop_gradient=False - predicts = model(x_data) - loss = F.cross_entropy(predicts, y_data) - # 计算损失 - acc = paddle.metric.accuracy(predicts, y_data) - loss.backward() - if batch_id % 300 == 0: - print("epoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc.numpy())) - optim.step() - optim.clear_grad() - -test_loader = paddle.io.DataLoader(test_dataset, places=paddle.CPUPlace(), batch_size=64) -# 加载测试数据集 -def test(model): - model.eval() - batch_size = 64 - for batch_id, data in enumerate(test_loader()): - x_data = data[0] - y_data = data[1] - x_data = prepare_data(x_data) - x_data = x_data.to_sparse_coo(4) - predicts = model(x_data) - # 获取预测结果 - loss = F.cross_entropy(predicts, y_data) - acc = paddle.metric.accuracy(predicts, y_data) - if batch_id % 20 == 0: - print("batch_id: {}, loss is: {}, acc is: {}".format(batch_id, loss.numpy(), acc.numpy())) - -with _test_eager_guard(): - sparse_model = SparseLeNet() - print(sparse_model) - - t0 = time.time() - sparse_train(sparse_model) - t1 = time.time() - print("spare time:", t1-t0) - test(sparse_model) - #x = paddle.randn((1, 1,28,28,1)) - #x.stop_gradient=False - #sparse_x = x.to_sparse_coo(4) - #print("sparse_x values shape:", sparse_x.values().shape) - #out = sparse_model(sparse_x) - #out.backward(out) - #print("end") - From a13794700ebdcd88c08e49cdbddd733e80e7bf8d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 21 Jun 2022 08:39:10 +0000 Subject: [PATCH 28/70] merge origin --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 17 +-- .../kernels/sparse/gpu/coalesced_kernel.cu | 51 +++++---- .../phi/kernels/sparse/gpu/convolution.cu.h | 12 +- .../sparse/gpu/convolution_grad_kernel.cu | 94 ++++++++-------- .../kernels/sparse/gpu/convolution_kernel.cu | 104 +++++++++--------- 5 files changed, 144 insertions(+), 134 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 48b12f8a1b6de..cd89c916db577 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -41,9 +41,10 @@ __global__ void ScatterKernel(const T* input, T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; i += gridDim.x * blockDim.x) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; @@ -51,17 +52,19 @@ __global__ void ScatterKernel(const T* input, int end = indices_i == non_zero_num - 1 ? 
rulebook_len : unique_value[indices_i + 1]; // max(end-start) = kernel_size - StoreT sums={static_cast(0)}; + StoreT sums = {static_cast(0)}; for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; LoadT vec_in; - phi::Load(input + out_feature_i * channels + channels_i * VecSize, &vec_in); + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); #pragma unroll - for(int k = 0; k < VecSize; k++){ + for (int k = 0; k < VecSize; k++) { sums[k] += vec_in[k]; } } - phi::Store(sums, out + indices_i * channels + channels_i * VecSize); + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 44ecb4365a9c1..60d90a18d4633 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -133,30 +133,33 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, // 5. scatter the values const int VecSize = VecBytes / sizeof(T); - if(stride % VecSize == 0){ - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - nnz * stride / VecSize, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); - }else{ - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - nnz * stride, 1); - phi::funcs::sparse::ScatterKernel - <<>>( - x_values_ptr, - public_indexs.data(), - values_indexs_ptr, - out_nnz, - nnz, - stride, - out_values.data()); + if (stride % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, nnz * stride / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel + <<>>(x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); } // 6. convert index to coordinate diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7f0d4814b75d0..a08c7931bb4f4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -24,12 +24,12 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { namespace sparse { @@ -53,16 +53,16 @@ __global__ void GatherKernel(const T* params, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size*slice_size/VecSize, int64_t) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size / VecSize, int64_t) { const int vec_slice_size = slice_size / VecSize; int indices_i = i / vec_slice_size; int slice_i = i - indices_i * vec_slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - int64_t params_i = gather_i * slice_size + slice_i*VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; + int64_t params_i = gather_i * slice_size + slice_i * VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; LoadT params_vec; - phi::Load(params + params_i, ¶ms_vec); + phi::Load(params + params_i, ¶ms_vec); phi::Store(params_vec, output + i * VecSize); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index ebf2bf6cae896..d91c93fde66fb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -139,54 +139,56 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } const int VecSize = VecBytes / sizeof(T); - if(in_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); } - if(out_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels / VecSize, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + if (out_channels % VecSize == 0) { + auto config = 
phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels / VecSize, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel + <<>>(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); } const T* kernel_ptr = kernel.data(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index c2942bdfd2b63..c1ea1c1894461 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -111,28 +111,30 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, &out_features, static_cast(0.0f)); const int VecSize = VecBytes / sizeof(T); - if(in_channels % VecSize == 0){ - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels / VecSize, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); - }else{ - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + if (in_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, n * in_channels / VecSize, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); + } else { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel + <<>>(x.non_zero_elements().data(), + rulebook_ptr + n, + in_features_ptr, + n, + in_channels); } // 3. 
call gemm for every werght @@ -182,35 +184,35 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out_channels, false); } else { - if(out_channels % VecSize == 0){ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - }else{ - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - } + if (out_channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernel + <<>>(out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); + } } } /** From 6a92b32a5c70d25e571793ac54e7f2de5c7b4523 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 07:34:59 +0000 Subject: [PATCH 29/70] opt subm --- paddle/phi/core/sparse_coo_tensor.h | 7 ++ .../kernels/sparse/convolution_grad_kernel.h | 6 +- .../phi/kernels/sparse/convolution_kernel.h | 18 +---- .../sparse/cpu/convolution_grad_kernel.cc | 7 +- .../kernels/sparse/cpu/convolution_kernel.cc | 55 ++++++++----- .../phi/kernels/sparse/gpu/convolution.cu.h | 1 + .../sparse/gpu/convolution_grad_kernel.cu | 8 +- .../kernels/sparse/gpu/convolution_kernel.cu | 81 +++++++++++++------ .../kernels/test_sparse_conv3d_dev_api.cc | 18 ++--- python/paddle/utils/code_gen/sparse_api.yaml | 5 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 6 +- 11 files changed, 129 insertions(+), 83 deletions(-) diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index c65b5ce57430b..7f2da7afe824b 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -155,6 +155,11 @@ class SparseCooTensor : public TensorBase, /// \brief get the dnese dim int32_t dense_dim() const; + const DenseTensor& rulebook() const { return rulebook_; } + DenseTensor* mutable_rulebook() { return &rulebook_; } + void SetRulebook(const DenseTensor& rulebook) { rulebook_ = rulebook; } + const bool subm() const { return subm_; } + void SetSubm(const bool subm) { subm_ = subm; } private: // save the indices of non zero elements in original dense tensor @@ -165,6 +170,8 @@ class SparseCooTensor : public TensorBase, bool coalesced_ = false; // save the number of non zero elements in each batch DDim dims_; + DenseTensor rulebook_; + bool subm_ = false; /* --------------------------- */ /* example: non zero element is scalar */ /* --------------------------- */ diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index eebfcddfc7a9e..03a7403b1f41d 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -26,7 +26,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& 
out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -41,7 +41,7 @@ std::tuple Conv3dGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -55,7 +55,7 @@ std::tuple Conv3dGrad( Conv3dGradKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 62a72a9dd4115..90e46800c92ca 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -31,8 +31,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook); + SparseCooTensor* out); template SparseCooTensor Conv3d(const Context& dev_ctx, @@ -42,19 +41,10 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, - const bool subm, - DenseTensor* rulebook) { + const bool subm) { SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - &coo, - rulebook); + Conv3dKernel( + dev_ctx, x, kernel, paddings, dilations, strides, groups, subm, &coo); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5a981fb8df350..b750a688ff867 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -35,7 +35,7 @@ template void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -48,6 +48,7 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + const DenseTensor& rulebook = out.rulebook(); const IntT* rulebook_ptr = rulebook.data(); const int rulebook_len = rulebook.dims()[1]; @@ -182,7 +183,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -196,7 +197,7 @@ void Conv3dGradKernel(const Context& dev_ctx, Conv3dGradCPUKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 1b95de890deeb..6db26ffcc7094 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" @@ -35,8 +36,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -70,20 +70,37 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel); - - UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); - + DenseTensor* rulebook = nullptr; + // int n = 0; + if (subm && x.subm()) { + DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); + phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, &out_rulebook); + out->SetRulebook(out_rulebook); + rulebook = out->mutable_rulebook(); + // n = rulebook->dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(subm); + } else { + ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); + } int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -159,8 +176,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, @@ -171,8 +187,7 @@ void Conv3dKernel(const Context& dev_ctx, strides, groups, subm, - out, - rulebook); + out); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a08c7931bb4f4..aafd06d606d33 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -555,6 +555,7 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(true); } return rulebook_len; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d91c93fde66fb..a5573f74be441 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -42,7 +42,7 @@ template void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& 
paddings, const std::vector& dilations, @@ -55,6 +55,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + + const DenseTensor& rulebook = out.rulebook(); const IntT* rulebook_ptr = rulebook.data(); const int rulebook_len = rulebook.dims()[1]; @@ -253,7 +255,7 @@ template void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const DenseTensor& rulebook, + const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -267,7 +269,7 @@ void Conv3dGradKernel(const Context& dev_ctx, Conv3dGradGPUKernel(dev_ctx, x, kernel, - rulebook, + out, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index c1ea1c1894461..fc7f939e9bde9 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -34,8 +34,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -75,25 +74,61 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - int n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); + DenseTensor* rulebook = nullptr; + int n = 0; + if (subm && x.subm()) { + DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); + phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, &out_rulebook); + out->SetRulebook(out_rulebook); + rulebook = out->mutable_rulebook(); + n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); - const int* offsets_ptr = counter_per_kernel.data(); + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + out->SetSubm(subm); + const IntT* rulebook_ptr = rulebook->data(); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + n * sizeof(IntT), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + std::vector counter(kernel_size, 0); + for (int i = 0; i < n; i++) { + counter[h_counter[i]] += 1; + } + IntT offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + } else { + rulebook = out->mutable_rulebook(); + n = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); + } + + // const int* counter_ptr = counter_per_kernel.data(); + // const int* offsets_ptr = counter_per_kernel.data(); const IntT* 
rulebook_ptr = rulebook->data(); // 2. gather @@ -229,8 +264,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, @@ -241,8 +275,7 @@ void Conv3dKernel(const Context& dev_ctx, strides, groups, subm, - out, - rulebook); + out); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index b7d56cb0d2b06..b8ae97a449f80 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -112,8 +112,8 @@ void TestConv3dBase(const std::vector& indices, }; if (!std::is_same::value) { - DenseTensor rulebook = phi::Empty( - dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + // DenseTensor rulebook = phi::Empty( + // dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, x_tensor, kernel_tensor, @@ -121,8 +121,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, - subm, - &rulebook); + subm); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -142,7 +141,7 @@ void TestConv3dBase(const std::vector& indices, sparse::Conv3dGrad(dev_ctx_cpu, x_tensor, kernel_tensor, - rulebook, + out, out, paddings, dilations, @@ -196,8 +195,8 @@ void TestConv3dBase(const std::vector& indices, phi::Copy( dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); - DenseTensor d_rulebook = phi::Empty( - dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + // DenseTensor d_rulebook = phi::Empty( + // dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, @@ -205,8 +204,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, - subm, - &d_rulebook); + subm); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -243,7 +241,7 @@ void TestConv3dBase(const std::vector& indices, sparse::Conv3dGrad(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, - d_rulebook, + d_out, d_out, paddings, dilations, diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 84c6d2a16af43..34b4fd317283c 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -1,10 +1,9 @@ - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(out), Tensor(rulebook) + output : Tensor(out) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense} + func : sparse_conv3d{sparse_coo, dense -> sparse_coo} layout : x - intermediate : rulebook backward : conv3d_grad - api : coo_to_dense diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 5d9874dff29ec..0c00ba4c16491 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -1,9 +1,9 @@ - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> 
Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor) + args : (Tensor x, Tensor kernel, Tensor out, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) From e96f090365266dbedab26870513e2d4c9e14582c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 11:18:55 +0000 Subject: [PATCH 30/70] opt subm --- .../phi/kernels/sparse/gpu/convolution.cu.h | 386 ++++++++++++------ 1 file changed, 261 insertions(+), 125 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 24a7387d4fe19..982c06dfcd8f3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -268,6 +268,111 @@ __global__ void ProductRuleBookKernel(const T* x_indices, } } +template +__global__ void GetOutIndexTable(const IntT* indices, + const IntT non_zero_num, + const Dims4D dims, + IntT* out_index_table) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { + IntT batch = indices[i]; + IntT in_z = indices[i + non_zero_num]; + IntT in_y = indices[i + 2 * non_zero_num]; + IntT in_x = indices[i + 3 * non_zero_num]; + IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); + out_index_table[index] = i; + } +} + +template +__global__ void ProductSubmRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + const T* out_index_table, + T* rulebook, + int* counter, + T* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + extern __shared__ int counter_buf[]; // kernel_size + int* counter_buf2 = counter_buf + kernel_size; + // length = kernel_size * blockDim.x * 2; + int* rulebook_buf = counter_buf + kernel_size * 2; + + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + T batch = x_indices[i]; + T in_z = x_indices[i + non_zero_num]; + T in_y = x_indices[i + 2 * non_zero_num]; + T in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + T out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + T out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + T out_x = (in_x + paddings[3] - kx * dilations[3]) / 
strides[3]; + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + int real_out_index = out_index_table[out_index]; + if (real_out_index != -1) { + in_i = i; + int buf_i = atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + rulebook_buf[kernel_index * blockDim.x + buf_i] = in_i; + rulebook_buf[kernel_index * blockDim.x + + kernel_size * blockDim.x + buf_i] = real_out_index; + } + } + // rulebook[kernel_index * non_zero_num + i] = kernel_i; + // rulebook[kernel_index * non_zero_num + offset + i] = in_i; + // rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf2[i] = atomicAdd(&counter[i], counter_buf[i]); + } + __syncthreads(); + for (int i = 0; i < kernel_size; i++) { + if (threadIdx.x < counter_buf[i]) { + rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; + rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = + rulebook_buf[i * blockDim.x + threadIdx.x]; + rulebook[i * non_zero_num + offset * 2 + counter_buf2[i] + threadIdx.x] = + rulebook_buf[i * blockDim.x + kernel_size * blockDim.x + threadIdx.x]; + } + } +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -309,12 +414,6 @@ int ProductRuleBook(const Context& dev_ctx, int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); - *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); - IntT* rulebook_ptr = rulebook->data(); const auto x_dims = x.dims(); Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); @@ -329,145 +428,193 @@ int ProductRuleBook(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - rulebook_ptr, - counter_ptr, - in_indexs.data()); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); - IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, - sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - rulebook_len /= 3; - if (subm) { // At present, hashtable is not used to map the input and output indexes. // At present, the intermediate output index is generated by normal // convolution, // and then the intermediate output index is subtracted from the input index // to obain the rulebook. 
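// The "+" block below replaces that intermediate-index/lower_bound scheme for the
// subm case with a dense lookup table: GetOutIndexTable scatters each nonzero's row
// position into out_index_table (pre-filled with -1), keyed by its flattened
// coordinate, and ProductSubmRuleBookKernel then resolves every candidate output
// coordinate with a single table read, skipping entries that stayed -1.
// A minimal standalone CUDA sketch of the same build-then-lookup pattern
// (hypothetical names, no phi:: dependencies; assumes the flattened indices are
// unique and the table was filled with -1 beforehand; not part of this kernel):
//
//   __global__ void BuildIndexTable(const int* flat_idx, int nnz, int* table) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < nnz) table[flat_idx[i]] = i;   // record the row of each nonzero
//   }
//
//   __global__ void LookupIndex(const int* query, int n,
//                               const int* table, int* pos) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) pos[i] = table[query[i]];   // -1 means the coordinate is absent
//   }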
+ const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); + IntT* rulebook_ptr = tmp_rulebook.data(); + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - // call lower_bound to get the real index of out_index - const IntT* in_indexs_ptr = in_indexs.data(); - IntT* out_indexs_ptr = rulebook_ptr + 2 * rulebook_len; - DenseTensor bound = phi::Empty( - dev_ctx, - DenseTensorMeta( - indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); - IntT* bound_ptr = bound.data(); -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - in_indexs_ptr, - in_indexs_ptr + in_indexs.numel(), - out_indexs_ptr, - out_indexs_ptr + rulebook_len, - bound_ptr); - - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - - UpdateOutIndexAndCounterAfterLowerBound<<>>( - in_indexs_ptr, - bound.data(), - rulebook_len, - kernel_size, - x.nnz(), - rulebook_ptr, - out_indexs_ptr, - counter_ptr); - -// remove -1 + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); + IntT* out_index_table_ptr = out_index_table.data(); + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), + out_index_table_ptr, + out_index_table_ptr + out_index_table.numel(), + -1); + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + GetOutIndexTable<<>>( + out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); + + if (config.thread_per_block.x > 128) { + config.block_per_grid.x *= config.thread_per_block.x / 128; + config.thread_per_block.x = 128; + } + size_t cache_size = kernel_size * 2 + kernel_size * + config.thread_per_block.x * 2 * + sizeof(int); + ProductSubmRuleBookKernel + <<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + out_index_table_ptr, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + + out->SetMember(out_indices, out_values, out_dims, true); + + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + int rulebook_len = + (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; + DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); + IntT* out_rulebook_ptr = out_rulebook.data(); + for (int i = 0; i < kernel_size; i++) { + if ((*h_counter)[i] <= 0) continue; + phi::backends::gpu::GpuMemcpyAsync(out_rulebook_ptr + (*h_offsets)[i], + rulebook_ptr + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + out_rulebook_ptr + rulebook_len + (*h_offsets)[i], + rulebook_ptr + kernel_size * 
non_zero_num + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync( + out_rulebook_ptr + 2 * rulebook_len + (*h_offsets)[i], + rulebook_ptr + 2 * kernel_size * non_zero_num + i * non_zero_num, + (*h_counter)[i] * sizeof(IntT), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + } + *rulebook = out_rulebook; + return rulebook_len; + + } else { + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); + IntT* rulebook_ptr = rulebook->data(); + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + + // 2. remove -1 #ifdef PADDLE_WITH_HIP IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), #else IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), #endif rulebook_ptr, - rulebook_ptr + 3 * rulebook_len, + rulebook_ptr + rulebook_rows * rulebook_cols, -1); - phi::funcs::sparse::DistanceKernel - <<<1, 1, 0, dev_ctx.stream()>>>(rulebook_ptr, last, bound_ptr); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - bound_ptr, - sizeof(IntT), + + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + IntT rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(IntT), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); dev_ctx.Wait(); rulebook_len /= 3; - } #ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), #else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), #endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); - - rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); + dev_ctx.stream()); - if (!subm) { + rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. 
sorted or merge the out index out_index->ResizeAndAllocate({static_cast(rulebook_len)}); unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); @@ -538,19 +685,8 @@ int ProductRuleBook(const Context& dev_ctx, out_indices_ptr, rulebook_ptr + 2 * rulebook_len); out->SetMember(out_indices, out_values, out_dims, true); - } else { - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::Empty(dev_ctx, - DenseTensorMeta(x.dtype(), - {x.nnz(), kernel_sizes[4]}, - x.non_zero_elements().layout())); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; } - return rulebook_len; } } // namespace sparse From c7eddc52808f5518e5b14d440dd2118b360c5948 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 22 Jun 2022 12:12:30 +0000 Subject: [PATCH 31/70] opt copy rulebook --- .../phi/kernels/sparse/gpu/convolution.cu.h | 70 +++++++++++++------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index e4c42b701c0d7..d8cc45c445159 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -289,6 +289,41 @@ __global__ void GetOutIndexTable(const IntT* indices, } } +template +__global__ void CopyRuleBook(const int* counters, + const int* offsets, + const IntT* in_rulebook, + const int len, + const int kernel_size, + const int non_zero_num, + IntT* out_rulebook) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + extern __shared__ int cache_counters[]; + int* cache_offsets = cache_counters + kernel_size; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_counters[i] = counters[i]; + cache_offsets[i] = offsets[i]; + } + __syncthreads(); + for (int i = tid; i < len; i += gridDim.x * blockDim.x) { + // get the kernel index + int kernel_index = 0; + for (; kernel_index < kernel_size - 1; kernel_index++) { + if (i >= offsets[kernel_index] && i < offsets[kernel_index + 1]) { + break; + } + } + int inner_index = i - offsets[kernel_index]; + out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; + out_rulebook[len + i] = + in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + + inner_index]; + out_rulebook[2 * len + i] = + in_rulebook[2 * kernel_size * non_zero_num + + kernel_index * non_zero_num + inner_index]; + } +} + template __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D x_dims, @@ -355,9 +390,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, kernel_size * blockDim.x + buf_i] = real_out_index; } } - // rulebook[kernel_index * non_zero_num + i] = kernel_i; - // rulebook[kernel_index * non_zero_num + offset + i] = in_i; - // rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; ++kernel_index; } } @@ -519,26 +551,18 @@ int ProductRuleBook(const Context& dev_ctx, (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); - for (int i = 0; i < kernel_size; i++) { - if ((*h_counter)[i] <= 0) continue; - phi::backends::gpu::GpuMemcpyAsync(out_rulebook_ptr + (*h_offsets)[i], - rulebook_ptr + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync( - out_rulebook_ptr + 
rulebook_len + (*h_offsets)[i], - rulebook_ptr + kernel_size * non_zero_num + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync( - out_rulebook_ptr + 2 * rulebook_len + (*h_offsets)[i], - rulebook_ptr + 2 * kernel_size * non_zero_num + i * non_zero_num, - (*h_counter)[i] * sizeof(IntT), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - } + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + cache_size = kernel_size * 2 * sizeof(int); + CopyRuleBook<<>>(counter_ptr, + offsets_ptr, + rulebook_ptr, + rulebook_len, + kernel_size, + non_zero_num, + out_rulebook_ptr); *rulebook = out_rulebook; return rulebook_len; From dd5e4fde5403f32bee0d7624140bc588a955f4f2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 01:34:13 +0000 Subject: [PATCH 32/70] check cache size --- .../phi/kernels/sparse/gpu/convolution.cu.h | 67 ++++++++----------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d8cc45c445159..a0b98603f85cc 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -214,10 +215,8 @@ __global__ void ProductRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int counter_buf[]; // kernel_size const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; @@ -233,9 +232,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -333,11 +329,9 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, const T* out_index_table, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; extern __shared__ int counter_buf[]; // kernel_size @@ -357,9 +351,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -447,8 +438,6 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = 
non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(indices_dtype, {x.nnz()}, DataLayout::NCHW)); int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; @@ -503,30 +492,34 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>( out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); - if (config.thread_per_block.x > 128) { - config.block_per_grid.x *= config.thread_per_block.x / 128; - config.thread_per_block.x = 128; - } size_t cache_size = kernel_size * 2 + kernel_size * config.thread_per_block.x * 2 * sizeof(int); - ProductSubmRuleBookKernel - <<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - out_index_table_ptr, - rulebook_ptr, - counter_ptr, - in_indexs.data()); + const int MAX_CACHE_SIZE = 48 * 1024; + while (cache_size >= MAX_CACHE_SIZE) { + config.thread_per_block.x /= 2; + config.block_per_grid.x *= 2; + PADDLE_ENFORCE_GE(config.thread_per_block.x, + 32, + phi::errors::Fatal("the shared memory is not enough")); + size_t cache_size = kernel_size * 2 + kernel_size * + config.thread_per_block.x * 2 * + sizeof(int); + } + ProductSubmRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + out_index_table_ptr, + rulebook_ptr, + counter_ptr); out->SetMember(out_indices, out_values, out_dims, true); @@ -584,10 +577,8 @@ int ProductRuleBook(const Context& dev_ctx, d_paddings, d_dilations, d_strides, - subm, rulebook_ptr, - counter_ptr, - in_indexs.data()); + counter_ptr); // 2. remove -1 #ifdef PADDLE_WITH_HIP From 52367a3160de506e43fc42d15e9fab9efce6b5fa Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 01:34:13 +0000 Subject: [PATCH 33/70] check cache size --- .../phi/kernels/sparse/gpu/convolution.cu.h | 66 ++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d8cc45c445159..a3eb06563739c 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -214,10 +215,8 @@ __global__ void ProductRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int counter_buf[]; // kernel_size const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; @@ -233,9 +232,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -333,11 +329,9 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const bool subm, const T* out_index_table, T* rulebook, - int* counter, - T* in_indexs) { + int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; extern __shared__ int counter_buf[]; // kernel_size @@ -357,9 +351,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, T in_z = x_indices[i + non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { @@ -447,8 +438,6 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(indices_dtype, {x.nnz()}, DataLayout::NCHW)); int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; @@ -503,30 +492,33 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>( out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); - if (config.thread_per_block.x > 128) { - config.block_per_grid.x *= config.thread_per_block.x / 128; - config.thread_per_block.x = 128; - } size_t cache_size = kernel_size * 2 + kernel_size * config.thread_per_block.x * 2 * sizeof(int); - ProductSubmRuleBookKernel - <<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - out_index_table_ptr, - rulebook_ptr, - counter_ptr, - in_indexs.data()); + const int MAX_CACHE_SIZE = 48 * 1024; + while (cache_size >= MAX_CACHE_SIZE) { + config.thread_per_block.x /= 2; + config.block_per_grid.x *= 2; + PADDLE_ENFORCE_GE(config.thread_per_block.x, + 32, + phi::errors::Fatal("the shared memory is not enough")); + cache_size = kernel_size * 2 + + kernel_size * config.thread_per_block.x * 2 * sizeof(int); + } + ProductSubmRuleBookKernel<<>>(indices_ptr, + d_x_dims, + 
d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + out_index_table_ptr, + rulebook_ptr, + counter_ptr); out->SetMember(out_indices, out_values, out_dims, true); @@ -584,10 +576,8 @@ int ProductRuleBook(const Context& dev_ctx, d_paddings, d_dilations, d_strides, - subm, rulebook_ptr, - counter_ptr, - in_indexs.data()); + counter_ptr); // 2. remove -1 #ifdef PADDLE_WITH_HIP From 345ebb2c512dcecf831ed6bba10187f4d08f3987 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 23 Jun 2022 05:28:03 +0000 Subject: [PATCH 34/70] correct alloc out values --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a3eb06563739c..7a3ec91bb9835 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -469,7 +469,11 @@ int ProductRuleBook(const Context& dev_ctx, IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + DenseTensor out_values = + phi::Empty(dev_ctx, + DenseTensorMeta(x.dtype(), + {x.nnz(), kernel_sizes[4]}, + x.non_zero_elements().layout())); phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); From 596bfbddee69eda16cdc272b47e220ca01eec7a7 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 24 Jun 2022 14:24:10 +0000 Subject: [PATCH 35/70] fix backward --- paddle/phi/kernels/sparse/cpu/convolution.h | 7 ++-- .../kernels/sparse/cpu/convolution_kernel.cc | 17 +++++---- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 10 +++--- .../sparse/gpu/convolution_grad_kernel.cu | 16 ++------- .../kernels/sparse/gpu/convolution_kernel.cu | 36 ------------------- .../tests/unittests/test_sparse_conv_op.py | 12 ++++--- .../incubate/sparse/nn/functional/conv.py | 3 +- 7 files changed, 29 insertions(+), 72 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index b2544619774c2..07baf77ff5d27 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -41,13 +41,12 @@ void ProductRuleBook(const Context& dev_ctx, const DDim& out_dims, const bool subm, DenseTensor* rulebook, - DenseTensor* counter_per_kernel) { + std::vector* counter_per_kernel) { const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); - int* counter_ptr = counter_per_kernel->data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - memset(counter_ptr, 0, kernel_size * sizeof(int)); + memset(counter_per_kernel->data(), 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len @@ -107,7 +106,7 @@ void ProductRuleBook(const Context& dev_ctx, } if (rulebook_ptr == nullptr) { - counter_ptr[kernel_index - 1] += 1; + (*counter_per_kernel)[kernel_index - 1] += 1; ++rulebook_len; } else { rulebook_ptr[rulebook_index] = kernel_index - 1; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 8fdd5c5ca0f51..7c37d90cd4cf9 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -67,9 +67,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // Second algorithm: // 
https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; @@ -80,11 +78,7 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, if (subm && table != nullptr) { const DenseTensor& rulebook = table->first; rulebook_ptr = rulebook.data(); - // DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); - // phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, - // &out_rulebook); out->SetRulebook(out_rulebook); out->SetTablePtr(x.GetTablePtr()); - // rulebook = out->mutable_rulebook(); n = rulebook.dims()[1]; DenseTensor out_indices = @@ -93,7 +87,9 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); - // out->SetSubm(subm); + memcpy(counter_per_kernel.data(), + table->second.data(), + kernel_size * sizeof(int)); } else { DenseTensor rulebook; ProductRuleBook(dev_ctx, @@ -110,9 +106,12 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, out_channels, out_dims, &rulebook, out); n = rulebook.dims()[1]; + out->SetTablePtr(x.GetTablePtr()); + out->SetTable(key, std::make_pair(rulebook, counter_per_kernel)); + rulebook_ptr = rulebook.data(); } // int n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); + const int* counter_ptr = counter_per_kernel.data(); // 2. gather DenseTensorMeta in_features_meta( diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index 7655913374dbd..f4d6e807538ea 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -47,9 +47,11 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + // DenseTensorMeta counter_meta( + // DataType::INT32, {kernel_size}, DataLayout::NCHW); + // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, + // std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); const T* in_features_ptr = x.non_zero_elements().data(); // 1. 
product rule book @@ -69,7 +71,7 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, int rulebook_len = rulebook->dims()[1]; const IntT* rulebook_ptr = rulebook->data(); - const int* counter_ptr = counter_per_kernel.data(); + const int* counter_ptr = counter_per_kernel.data(); std::vector offsets(kernel_size + 1); phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 1560f2faa5cac..9533d456af0a6 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -100,20 +100,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, &x_grad_indices); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(rulebook_len, 0); - // phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], - // rulebook_ptr, - // rulebook_len * sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - // dev_ctx.Wait(); + std::vector offsets(kernel_size + 1); + const auto& counter = table->second; - // for (int i = 0; i < rulebook_len; i++) { - // counter[h_counter[i]] += 1; - // } - memcpy(counter.data(), table->second.data(), kernel_size * sizeof(int)); - IntT offset = 0, max_count = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 3df89322405f8..95475ed5cb9e3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -77,7 +77,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); int n = 0; - // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; PADDLE_ENFORCE_EQ( key.empty(), @@ -90,11 +89,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); out->SetTablePtr(x.GetTablePtr()); - clock_t t0 = clock(); - // DenseTensor out_rulebook = phi::EmptyLike(dev_ctx, x.rulebook()); - // phi::Copy(dev_ctx, x.rulebook(), dev_ctx.GetPlace(), false, - // &out_rulebook); out->SetRulebook(out_rulebook); rulebook = - // out->mutable_rulebook(); n = rulebook.dims()[1]; DenseTensor out_indices = @@ -103,39 +97,14 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, true); - // out->SetSubm(subm); - // const IntT* rulebook_ptr = rulebook->data(); - // std::vector counter(n, 0); - // clock_t t1 = clock(); - // phi::backends::gpu::GpuMemcpyAsync(&counter[0], - // rulebook_ptr, - // n * sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - // dev_ctx.Wait(); - // clock_t t2 = clock(); - // for (int i = 0; i < n; i++) { - // PADDLE_ENFORCE_LT(counter[i], - // kernel_size, - // phi::errors::Fatal("the kernel index must less than - // kernel_size")); - // h_counter[counter[i]] += 1; - // } IntT offset = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += h_counter[i]; } offsets[kernel_size] = offset; - // clock_t t3 = clock(); - // auto f = [](clock_t start, clock_t end) -> float{ - // return (float)(end-start)/CLOCKS_PER_SEC; - // }; - // printf("%f %f 
%f\n", f(t0, t1), f(t1, t2), f(t2, t3)); } else { DenseTensor rulebook; - // rulebook = &empty_rulebook; - // rulebook = out->mutable_rulebook(); n = ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -152,16 +121,11 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out, &h_counter, &offsets); - // out->SetSubm(subm); out->SetTablePtr(x.GetTablePtr()); out->SetTable(key, std::make_pair(rulebook, h_counter)); rulebook_ptr = rulebook.data(); } - // const int* counter_ptr = counter_per_kernel.data(); - // const int* offsets_ptr = counter_per_kernel.data(); - /// const IntT* rulebook_ptr = rulebook->data(); - // 2. gather DenseTensorMeta in_features_meta( x.dtype(), {n, in_channels}, DataLayout::NCHW); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e1a9b2428babc..afd9c33421660 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -51,6 +51,7 @@ def test_conv3d(self): padding=paddings, dilation=dilations, groups=1, + key='conv3d', data_format="NDHWC") out.backward(out) assert np.array_equal(correct_out_values, out.values().numpy()) @@ -66,7 +67,7 @@ def test_subm_conv3d(self): indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') y = paddle.incubate.sparse.nn.functional.subm_conv3d( - sparse_x, weight) + sparse_x, weight, key='subm_conv') assert np.array_equal(sparse_x.indices().numpy(), y.indices().numpy()) @@ -84,13 +85,13 @@ def test_Conv3D(self): indices, values, dense_shape, False) sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( - 1, 1, (1, 3, 3), data_format='NDHWC') + 1, 1, (1, 3, 3), data_format='NDHWC', key='conv3d') sparse_out = sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW') + 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') def test_SubmConv3D(self): with _test_eager_guard(): @@ -104,7 +105,7 @@ def test_SubmConv3D(self): indices, values, dense_shape, False) subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NDHWC') + 1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv') # test extra_repr print(subm_conv3d.extra_repr()) @@ -116,7 +117,7 @@ def test_SubmConv3D(self): with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW') + 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') def test_Conv3D_bias(self): with _test_eager_guard(): @@ -129,6 +130,7 @@ def test_Conv3D_bias(self): sp_conv3d = paddle.incubate.sparse.nn.Conv3D(3, 2, 3, + key='conv3d', data_format='NDHWC') sp_conv3d.weight.set_value( paddle.to_tensor(conv3d.weight.numpy().transpose(2, 3, 4, 1, diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index a0de73e30fb86..62800dd01c65b 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -84,6 +84,7 @@ def conv3d(x, padding=0, dilation=1, groups=1, + key=None, data_format="NDHWC", name=None): r""" @@ -188,7 +189,7 @@ def conv3d(x, # (1, 1, 1, 2, 1) """ return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, - None, data_format, name) + key, data_format, name) def subm_conv3d(x, From 
c906bdb835add55f94354f8865519425b27d1f72 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sat, 25 Jun 2022 11:29:59 +0000 Subject: [PATCH 36/70] opt conv --- .../phi/kernels/sparse/gpu/convolution.cu.h | 283 ++++++++++++------ .../kernels/sparse/gpu/convolution_kernel.cu | 2 +- 2 files changed, 197 insertions(+), 88 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 7a3ec91bb9835..fddb743de5af7 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -68,6 +68,52 @@ __global__ void GatherKernel(const T* params, } } +template +__global__ void UniqueKernel(const IntT* in_indexs, + const int rulebook_len, + int* out_index_table, + int* out_indexs, + int* nnz) { + extern __shared__ int cache[]; + __shared__ int count, start; + if (threadIdx.x == 0) { + count = 0; + start = 0; + } + __syncthreads(); + + int i = threadIdx.x + blockDim.x * blockIdx.x; + if (i < rulebook_len) { + // atomicOr only support int + int index = static_cast(in_indexs[i]); + int change_index = index == 0 ? 1 : index; + int flag = atomicOr(out_index_table + index, change_index); + if (flag == 0) { + int j = atomicAdd(&count, 1); + cache[j] = index; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + start = atomicAdd(nnz, count); + } + __syncthreads(); + for (int i = threadIdx.x; i < count; i += blockDim.x) { + out_indexs[start + i] = cache[i]; + } +} + +template +__global__ void UpdateOutIndex(const int* out_index_table, + const int n, + IntT* out_indexs) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = out_indexs[i]; + out_indexs[i] = out_index_table[index]; + } +} + template inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, const IntT* rulebook_ptr, @@ -75,42 +121,38 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, DenseTensor* out_index, DenseTensor* unique_key, DenseTensor* unique_value) { - phi::IndexKernel>( - dev_ctx, out_index, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, unique_value, kps::IdentityFunctor()); - - phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - rulebook_ptr, - sizeof(IntT) * len, -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToDevice, -#else - cudaMemcpyDeviceToDevice, -#endif - dev_ctx.stream()); -// compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher -// performance, but thrust::merge_by_key limited by data size -#ifdef PADDLE_WITH_HIP - thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key->data(), - unique_key->data() + len, - out_index->data()); - - // 4. 
unique - thrust::pair new_end = -#ifdef PADDLE_WITH_HIP - thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - unique_key->data(), - unique_key->data() + len, - unique_value->data()); - return new_end.first; + // phi::IndexKernel>( + // dev_ctx, out_index, kps::IdentityFunctor()); + // phi::IndexKernel>( + // dev_ctx, unique_value, kps::IdentityFunctor()); + // + // phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), + // rulebook_ptr, + // sizeof(IntT) * len, + // gpuMemcpyDeviceToDevice, + // dev_ctx.stream()); + //// compared with thrust::sort_by_key, thrust::merge_by_key may achieved + // higher / performance, but thrust::merge_by_key limited by data size + // #ifdef PADDLE_WITH_HIP + // thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), + // #else + // thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), + // #endif + // unique_key->data(), + // unique_key->data() + len, + // out_index->data()); + // + // // 4. unique + // thrust::pair new_end = + // #ifdef PADDLE_WITH_HIP + // thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), + // #else + // thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), + // #endif + // unique_key->data(), + // unique_key->data() + len, + // unique_value->data()); + // return new_end.first; } /** @@ -285,6 +327,26 @@ __global__ void GetOutIndexTable(const IntT* indices, } } +template +__global__ void GetOutIndexTable(const int* indexs, + const int non_zero_num, + const Dims4D out_dims, + int* out_index_table, + IntT* out_indices) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { + IntT index = static_cast(indexs[i]); + out_index_table[index] = i; + IntT batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + } +} + template __global__ void CopyRuleBook(const int* counters, const int* offsets, @@ -649,66 +711,113 @@ int ProductRuleBook(const Context& dev_ctx, int* out_index_ptr = out_index->data(); int* unique_value_ptr = unique_value->data(); IntT* unique_key_ptr = unique_key.data(); - - IntT* new_end = - SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - &unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - IntT out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif + cudaMemsetAsync(unique_value_ptr, 0, sizeof(int), dev_ctx.stream()); + + // IntT* new_end = + // SortedAndUniqueIndex(dev_ctx, + // rulebook_ptr + 2 * rulebook_len, + // rulebook_len, + // out_index, + // &unique_key, + // unique_value); + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, 
{table_size}); + int* out_index_table_ptr = out_index_table.data(); + cudaMemsetAsync( + out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + size_t cache_size = sizeof(int) * config.thread_per_block.x; + UniqueKernel<<>>(rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index_table_ptr, + out_index_ptr, + unique_value_ptr); + int out_nnz = 0; + cudaMemcpyAsync(&out_nnz, + unique_value_ptr, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); - // 5. update out_indices and rulebook by unique_value_ptr const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( - indices_dtype, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta(x.dtype(), - {out_non_zero_num, kernel_sizes[4]}, - x.non_zero_elements().layout()); + indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_nnz, kernel_sizes[4]}, x.non_zero_elements().layout()); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + out->SetMember(out_indices, out_values, out_dims, true); IntT* out_indices_ptr = out_indices.data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel - <<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); + thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), + out_index_ptr, + out_index_ptr + out_nnz); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + GetOutIndexTable<<>>(out_index_ptr, + out_nnz, + d_out_dims, + out_index_table_ptr, + out_indices_ptr); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + UpdateOutIndex<<>>( + out_index_table_ptr, rulebook_len, rulebook_ptr + 2 * rulebook_len); + + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, + // dev_ctx.stream()>>>( + // unique_key_ptr, + // new_end, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1); + // IntT out_non_zero_num = 0; + // #ifdef PADDLE_WITH_HIP + // phi::backends::gpu::GpuMemcpyAsync( + // &out_non_zero_num, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // hipMemcpyDeviceToHost, + // dev_ctx.stream()); + // #else + // phi::backends::gpu::GpuMemcpyAsync( + // &out_non_zero_num, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // cudaMemcpyDeviceToHost, + // dev_ctx.stream()); + // #endif + + // 5. 
update out_indices and rulebook by unique_value_ptr + // config = + // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, + // out_non_zero_num, 1); + /// UpdateIndexKernel + // <<>>(unique_key_ptr, + // unique_value_ptr, + // out_index_ptr, + // out_non_zero_num, + // rulebook_len, + // d_out_dims, + // out_indices_ptr, + // rulebook_ptr + 2 * rulebook_len); return rulebook_len; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 0abeda531d6df..092b40a1ff4f0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -170,7 +170,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { + if (true) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); From 6dc1584d3f86874767b3e5726e95037e2d3494ac Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sun, 26 Jun 2022 10:34:27 +0000 Subject: [PATCH 37/70] opt conv3d --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 35 +++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 95 ++++++------------- .../kernels/sparse/gpu/convolution_kernel.cu | 46 ++------- 3 files changed, 73 insertions(+), 103 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 3afe13db73a9a..39a2ad7a1db0a 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -97,6 +97,41 @@ __global__ void ScatterCUDAKernel(const T* params, } } +template +__global__ void ScatterKernelV2(const T* input, + const int* out_index_counts, + const int* origin_out_indexs, + const int non_zero_num, + const int kernel_size, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + + int len = out_index_counts[indices_i]; + // max(end-start) = kernel_size + StoreT sums = {static_cast(0)}; + for (int j = 0; j < len; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index fddb743de5af7..88b2b97c79404 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -107,10 +107,19 @@ __global__ void UniqueKernel(const IntT* in_indexs, template __global__ void UpdateOutIndex(const int* out_index_table, const int n, - IntT* out_indexs) { + const int kernel_size, + IntT* out_indexs, + int* out_index_counts, + int* origin_out_indexs) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = out_indexs[i]; - out_indexs[i] = out_index_table[index]; + int real_index = out_index_table[index]; + out_indexs[i] = real_index; + + // kernel_size at most + int j = atomicAdd(out_index_counts + real_index, 1); + // nnz * kernel_size + 
origin_out_indexs[real_index * kernel_size + j] = i; } } @@ -703,23 +712,11 @@ int ProductRuleBook(const Context& dev_ctx, rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index out_index->ResizeAndAllocate({static_cast(rulebook_len)}); - unique_value->ResizeAndAllocate({static_cast(rulebook_len)}); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta( - indices_dtype, {static_cast(rulebook_len)}, DataLayout::NCHW)); + DenseTensor unique_key = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - IntT* unique_key_ptr = unique_key.data(); - cudaMemsetAsync(unique_value_ptr, 0, sizeof(int), dev_ctx.stream()); - - // IntT* new_end = - // SortedAndUniqueIndex(dev_ctx, - // rulebook_ptr + 2 * rulebook_len, - // rulebook_len, - // out_index, - // &unique_key, - // unique_value); + int* unique_key_ptr = unique_key.data(); + int64_t table_size = 1; for (int i = 0; i < out_dims.size() - 1; i++) { table_size *= out_dims[i]; @@ -728,6 +725,7 @@ int ProductRuleBook(const Context& dev_ctx, int* out_index_table_ptr = out_index_table.data(); cudaMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); + int* unique_value_ptr = unique_value->data(); + + // return rulebook_len; UpdateOutIndex<<>>( - out_index_table_ptr, rulebook_len, rulebook_ptr + 2 * rulebook_len); - - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, - // dev_ctx.stream()>>>( - // unique_key_ptr, - // new_end, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1); - // IntT out_non_zero_num = 0; - // #ifdef PADDLE_WITH_HIP - // phi::backends::gpu::GpuMemcpyAsync( - // &out_non_zero_num, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // hipMemcpyDeviceToHost, - // dev_ctx.stream()); - // #else - // phi::backends::gpu::GpuMemcpyAsync( - // &out_non_zero_num, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // cudaMemcpyDeviceToHost, - // dev_ctx.stream()); - // #endif - - // 5. 
update out_indices and rulebook by unique_value_ptr - // config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, - // out_non_zero_num, 1); - /// UpdateIndexKernel - // <<>>(unique_key_ptr, - // unique_value_ptr, - // out_index_ptr, - // out_non_zero_num, - // rulebook_len, - // d_out_dims, - // out_indices_ptr, - // rulebook_ptr + 2 * rulebook_len); + dev_ctx.stream()>>>(out_index_table_ptr, + rulebook_len, + kernel_size, + rulebook_ptr + 2 * rulebook_len, + out_index_ptr, + unique_value_ptr); + return rulebook_len; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 8df2939bdf551..e87fd71d511f6 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -199,42 +199,10 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, tmp_kernel_ptr, static_cast(0), tmp_out_ptr); - - if (subm) { - // if(out_channels % VecSize == 0){ - // auto config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * - // out_channels/VecSize, 1); - // phi::funcs::sparse::ScatterCUDAKernel - // <<>>(out_features_ptr, - // rulebook_ptr + 2 * n + offsets[i], - // out_values_ptr, - // M, - // out_channels, - // false); - // }else{ - // auto config = - // phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, M * - // out_channels, 1); - // phi::funcs::sparse::ScatterCUDAKernel - // <<>>(out_features_ptr, - // rulebook_ptr + 2 * n + offsets[i], - // out_values_ptr, - // M, - // out_channels, - // false); - // } - } } // 4. scatter - if (true) { + if (subm) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); @@ -252,29 +220,29 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, if (out_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernel + phi::funcs::sparse::ScatterKernelV2 <<>>(out_features_ptr, - unique_value.data(), out_index.data(), + unique_value.data(), out->nnz(), - n, + kernel_size, out_channels, out_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel + phi::funcs::sparse::ScatterKernelV2 <<>>(out_features_ptr, - unique_value.data(), out_index.data(), + unique_value.data(), out->nnz(), - n, + kernel_size, out_channels, out_values_ptr); } From 82027714e4c7673e27b474f8da06678905c44edb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:50:44 +0000 Subject: [PATCH 38/70] opt scatter --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 16 ++++++++++++++++ .../phi/kernels/sparse/gpu/convolution_kernel.cu | 15 ++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 88b2b97c79404..f7f822af641a2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -473,6 +473,21 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, } } +template +__global__ void UpdateOutIndex(const int n, + const int kernel_size, + const IntT* out_indexs, + int* out_index_counts, + int* origin_out_indexs) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = out_indexs[i]; + // kernel_size at most + int j = atomicAdd(out_index_counts + index, 1); + // nnz * kernel_size + origin_out_indexs[index * 
kernel_size + j] = i; + } +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -631,6 +646,7 @@ int ProductRuleBook(const Context& dev_ctx, non_zero_num, out_rulebook_ptr); *rulebook = out_rulebook; + return rulebook_len; } else { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index e87fd71d511f6..39208681a373f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -174,6 +174,19 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, if (subm) { // set_zero(dev_ctx, out_values, static_cast(0.0f)); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + unique_value.ResizeAndAllocate( + {static_cast(out->nnz() * kernel_size)}); + out_index.ResizeAndAllocate({static_cast(n)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync(out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + UpdateOutIndex<<>>( + n, kernel_size, rulebook_ptr + 2 * n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -202,7 +215,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { + if (false) { set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); From 75df1e242af0cdc7c51a7bc2cf2d1044e060e4c1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:51:48 +0000 Subject: [PATCH 39/70] opt SparseMaskCopy --- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 95 ++++++++++++------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index e1f2a9149b4bd..2daff1ba43ae1 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { @@ -146,6 +145,36 @@ __global__ void SparseMaskCopyKernel(const IntT* x_indexs, } } +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? -1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); + } + } + } +} + template void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, @@ -217,52 +246,52 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, mask_indexs.numel(), sparse_dim, mask_indexs_ptr); -// 4. 
call thrust::lower_bound -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - x_indexs_ptr, - x_indexs_ptr + x_indexs.numel(), - mask_indexs_ptr, - mask_indexs_ptr + mask_indexs.numel(), - bound_out_ptr); - // 5. copy value to out + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + cudaMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); const int VecSize = VecBytes / sizeof(T); if (stride % VecSize == 0) { - SparseMaskCopyKernel + MaskCopy <<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), + dev_ctx.stream()>>>(mask_indexs_ptr, + table.data(), mask_indexs.numel(), stride, - out_ptr); - } else { - SparseMaskCopyKernel - <<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, x.non_zero_elements().data(), - mask_indexs.numel(), - stride, out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); } } From 2745b0e91178beab72047352e6c350901a03da47 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 02:52:39 +0000 Subject: [PATCH 40/70] coalesced is not performed by default --- paddle/phi/kernels/sparse/sparse_utils_kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 93abf70b24412..2f5bb189c0ffe 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -156,7 +156,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, SparseCooTensor* out) { SparseCooTensor before_coalesced( indices, values, phi::make_ddim(dense_shape.GetData())); - CoalescedKernel(dev_ctx, before_coalesced, out); + // CoalescedKernel(dev_ctx, before_coalesced, out); + *out = before_coalesced; } } // namespace sparse From ad9c2b610146e933a47394a9687568c59bb019c2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 07:37:45 +0000 Subject: [PATCH 41/70] opt rulebook --- .../phi/kernels/sparse/gpu/convolution.cu.h | 138 ++++-------------- .../sparse/gpu/convolution_grad_kernel.cu | 35 +++-- .../kernels/sparse/gpu/convolution_kernel.cu | 10 +- 3 files changed, 54 insertions(+), 129 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index f7f822af641a2..57bf0d51779bf 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -123,47 +123,6 @@ __global__ void UpdateOutIndex(const int* out_index_table, } } -template -inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, - const IntT* rulebook_ptr, - const int len, - DenseTensor* 
out_index, - DenseTensor* unique_key, - DenseTensor* unique_value) { - // phi::IndexKernel>( - // dev_ctx, out_index, kps::IdentityFunctor()); - // phi::IndexKernel>( - // dev_ctx, unique_value, kps::IdentityFunctor()); - // - // phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - // rulebook_ptr, - // sizeof(IntT) * len, - // gpuMemcpyDeviceToDevice, - // dev_ctx.stream()); - //// compared with thrust::sort_by_key, thrust::merge_by_key may achieved - // higher / performance, but thrust::merge_by_key limited by data size - // #ifdef PADDLE_WITH_HIP - // thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), - // #else - // thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), - // #endif - // unique_key->data(), - // unique_key->data() + len, - // out_index->data()); - // - // // 4. unique - // thrust::pair new_end = - // #ifdef PADDLE_WITH_HIP - // thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), - // #else - // thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), - // #endif - // unique_key->data(), - // unique_key->data() + len, - // unique_value->data()); - // return new_end.first; -} - /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list @@ -205,42 +164,6 @@ __global__ void UpdateIndexKernel(const T* unique_keys, } } -template -__global__ void UpdateOutIndexAndCounterAfterLowerBound( - const IntT* x_indexs, - const IntT* bound_out, - const int rulebook_len, - const int kernel_size, - const int64_t non_zero_num, - IntT* rulebook_ptr, - IntT* out_indexs, - int* counter_ptr) { - extern __shared__ int cache_count[]; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { - int j = bound_out[i]; - if (j >= 0 && j < non_zero_num && out_indexs[i] == x_indexs[j]) { - out_indexs[i] = j; - } else { - // mask this position will be remove - int kernel_index = rulebook_ptr[i]; - rulebook_ptr[i + rulebook_len] = -1; - rulebook_ptr[i + 2 * rulebook_len] = -1; - rulebook_ptr[i] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - /** * @brief product rulebook * for input_i in x_indices: @@ -307,9 +230,9 @@ __global__ void ProductRuleBookKernel(const T* x_indices, atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + // rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + i] = in_i; + rulebook[kernel_index * non_zero_num + offset + i] = out_index; ++kernel_index; } } @@ -381,13 +304,11 @@ __global__ void CopyRuleBook(const int* counters, } } int inner_index = i - offsets[kernel_index]; + // out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[len + i] = in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + inner_index]; - out_rulebook[2 * len + i] = - in_rulebook[2 * kernel_size * non_zero_num + - kernel_index * non_zero_num + inner_index]; } } @@ -464,10 +385,10 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, __syncthreads(); for (int i = 0; i < kernel_size; i++) { if (threadIdx.x < 
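// Rulebook layout after this change (as the edits above suggest): the kernel-index
// row is dropped and only two rows remain,
//   rulebook[0 * rulebook_len + i] -> input position of pair i
//   rulebook[1 * rulebook_len + i] -> output position of pair i
// The kernel element each pair belongs to is still recoverable because the pairs
// stay grouped per kernel element and the per-kernel counts/offsets
// (h_counter / h_offsets) are copied back to the host. This is also why the
// gather/scatter call sites below switch from rulebook_ptr + rulebook_len and
// rulebook_ptr + 2 * rulebook_len to rulebook_ptr and rulebook_ptr + rulebook_len.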
counter_buf[i]) { - rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; - rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = + // rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i; + rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = rulebook_buf[i * blockDim.x + threadIdx.x]; - rulebook[i * non_zero_num + offset * 2 + counter_buf2[i] + threadIdx.x] = + rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] = rulebook_buf[i * blockDim.x + kernel_size * blockDim.x + threadIdx.x]; } } @@ -536,8 +457,12 @@ int ProductRuleBook(const Context& dev_ctx, Dims4D d_strides(1, strides[2], strides[1], strides[0]); Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); + // phi::funcs::SetConstant set_zero; + // set_zero(dev_ctx, counter_per_kernel, 0); + phi::backends::gpu::GpuMemsetAsync(counter_ptr, + 0, + sizeof(int) * counter_per_kernel->numel(), + dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); @@ -547,7 +472,7 @@ int ProductRuleBook(const Context& dev_ctx, // convolution, // and then the intermediate output index is subtracted from the input index // to obain the rulebook. - const int rulebook_rows = 3; + const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); @@ -631,7 +556,8 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.Wait(); int rulebook_len = (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; - DenseTensor out_rulebook = phi::Empty(dev_ctx, {3, rulebook_len}); + DenseTensor out_rulebook = + phi::Empty(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); cache_size = kernel_size * 2 * sizeof(int); @@ -650,7 +576,7 @@ int ProductRuleBook(const Context& dev_ctx, return rulebook_len; } else { - const int rulebook_rows = 3; + const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); @@ -681,20 +607,17 @@ int ProductRuleBook(const Context& dev_ctx, -1); phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - 1); IntT rulebook_len = 0; phi::backends::gpu::GpuMemcpyAsync( &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); + dev_ctx.Wait(); - rulebook_len /= 3; + rulebook_len /= 2; #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), @@ -708,21 +631,13 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], counter_ptr, kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], offsets_ptr, kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + 
gpuMemcpyDeviceToHost, dev_ctx.stream()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); @@ -742,12 +657,13 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + 2 * rulebook_len, + dev_ctx.stream()>>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, out_index_ptr, @@ -797,7 +713,7 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()>>>(out_index_table_ptr, rulebook_len, kernel_size, - rulebook_ptr + 2 * rulebook_len, + rulebook_ptr + rulebook_len, out_index_ptr, unique_value_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 9533d456af0a6..7cabae44e7903 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -83,7 +83,9 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); + // set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync( + d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); @@ -91,8 +93,14 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::EmptyLike(dev_ctx, x.non_zero_indices()); DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); T* x_grad_values_ptr = x_grad_values.data(); - set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); - set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + // set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, + 0, + sizeof(T) * x_grad_values.numel(), + dev_ctx.stream()); + // set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + phi::backends::gpu::GpuMemsetAsync( + d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy(dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), @@ -138,7 +146,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, + rulebook_ptr, in_features_ptr, rulebook_len, in_channels); @@ -150,7 +158,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, + rulebook_ptr, in_features_ptr, rulebook_len, in_channels); @@ -164,7 +172,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, + rulebook_ptr + rulebook_len, out_grad_features_ptr, rulebook_len, out_channels); @@ -176,7 +184,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, + rulebook_ptr + rulebook_len, out_grad_features_ptr, rulebook_len, out_channels); @@ -231,13 +239,12 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::funcs::ScatterCUDAKernel<<>>( - 
d_x_features_ptr, - rulebook_ptr + rulebook_len, - x_grad_values_ptr, - rulebook_len, - in_channels, - false); + dev_ctx.stream()>>>(d_x_features_ptr, + rulebook_ptr, + x_grad_values_ptr, + rulebook_len, + in_channels, + false); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 39208681a373f..20911109036b3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,6 +139,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); + // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, + // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { @@ -149,7 +151,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + n, + rulebook_ptr, in_features_ptr, n, in_channels); @@ -161,7 +163,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block.x, 0, dev_ctx.stream()>>>(x.non_zero_elements().data(), - rulebook_ptr + n, + rulebook_ptr, in_features_ptr, n, in_channels); @@ -186,7 +188,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block, 0, dev_ctx.stream()>>>( - n, kernel_size, rulebook_ptr + 2 * n, out_index_ptr, unique_value_ptr); + n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -224,7 +226,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, config.thread_per_block, 0, dev_ctx.stream()>>>(out_features_ptr, - rulebook_ptr + 2 * n, + rulebook_ptr + n, out_values_ptr, n, out_channels, From 214475bb64302594790ca38c3da39fde4898334e Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 27 Jun 2022 08:24:47 +0000 Subject: [PATCH 42/70] remove a sync --- .../phi/kernels/sparse/gpu/convolution.cu.h | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 57bf0d51779bf..bc15b43a3200c 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -606,18 +606,22 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - 1); - IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(IntT), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - - dev_ctx.Wait(); - rulebook_len /= 2; + // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + // rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - + // 1); + // IntT rulebook_len = 0; + // phi::backends::gpu::GpuMemcpyAsync( + // &rulebook_len, + // rulebook_ptr + rulebook_rows * rulebook_cols - 1, + // sizeof(IntT), + // gpuMemcpyDeviceToHost, + // dev_ctx.stream()); + + // dev_ctx.Wait(); + // rulebook_len /= 2; + // printf("rulebook_len = %d\n", rulebook_len); + // printf("distance = %d\n", last-rulebook_ptr); + IntT rulebook_len = (last - rulebook_ptr) / 2; #ifdef PADDLE_WITH_HIP 
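// Note on the sync removal just above: thrust::remove already returns the new
// logical end of the compacted rulebook (`last`) to the caller, so the number of
// surviving (in, out) pairs can be computed directly as (last - rulebook_ptr) / 2.
// The commented-out DistanceKernel launch, device-to-host copy and dev_ctx.Wait()
// only existed to fetch that same count, so they can be dropped along with one
// stream synchronization.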
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), From 13f0b93146205b4289747ff23f6895ab7678a8f2 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 04:05:05 +0000 Subject: [PATCH 43/70] gatherV2 --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 2 + .../phi/kernels/sparse/gpu/convolution.cu.h | 59 ++++++++++++++-- .../sparse/gpu/convolution_grad_kernel.cu | 68 ++++++++++++++++--- .../kernels/sparse/gpu/convolution_kernel.cu | 39 +++++++---- 4 files changed, 138 insertions(+), 30 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 39a2ad7a1db0a..de0a14c7dbf80 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -117,6 +117,8 @@ __global__ void ScatterKernelV2(const T* input, int len = out_index_counts[indices_i]; // max(end-start) = kernel_size StoreT sums = {static_cast(0)}; + phi::Load(out + indices_i * channels + channels_i * VecSize, + &sums); for (int j = 0; j < len; j++) { const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; LoadT vec_in; diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index bc15b43a3200c..dc8600ce31d9d 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -68,6 +68,34 @@ __global__ void GatherKernel(const T* params, } } +template +__global__ void GatherKernelV2(const T* inputs, + const int* index_counts, + const int* origin_indexs, + const int non_zero_num, + const int kernel_size, + T* output, + const int channels) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + int len = index_counts[indices_i]; + LoadT in_vec; + phi::Load(inputs + indices_i * channels + channels_i * VecSize, + &in_vec); + for (int j = 0; j < len; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + } +} + template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -397,15 +425,34 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, template __global__ void UpdateOutIndex(const int n, const int kernel_size, - const IntT* out_indexs, - int* out_index_counts, - int* origin_out_indexs) { + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - IntT index = out_indexs[i]; + IntT index = indexs[i]; + // kernel_size at most + int j = atomicAdd(index_counts + index, 1); + // nnz * kernel_size + index_groups[index * kernel_size + j] = i; + } +} + +template +__global__ void UpdateOutIndexV2(const int n, + const int kernel_size, + const int half_kernel_offset, + const IntT* indexs, + int* index_counts, + int* index_groups) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + IntT index = indexs[i]; // kernel_size at most - int j = atomicAdd(out_index_counts + index, 1); + int* counts_ptr = + i < half_kernel_offset ? index_counts : index_counts + nnz; + int j = atomicAdd(counts_ptr + index, 1); // nnz * kernel_size - origin_out_indexs[index * kernel_size + j] = i; + int group_offset = i < half_kernel_offset ? 
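// Rough idea of the V2 gather/scatter scheme in this patch: UpdateOutIndex records,
// for every distinct index, how many rulebook entries point at it (index_counts)
// and which entries those are (index_groups, with room for kernel_size entries per
// index). GatherKernelV2 then loads each feature row once and broadcasts it to all
// of its rulebook slots, and ScatterKernelV2 accumulates each output row by walking
// its group, so no atomics are needed on the feature values. The added phi::Load of
// `out` in ScatterKernelV2 makes the accumulation start from whatever is already
// stored in the output row rather than from an implicit zero.
// A minimal CPU sketch of the grouping step (illustrative only, hypothetical name,
// mirroring the kernel above):
//   void group_indexs(const int* indexs, int n, int kernel_size,
//                     int* counts, int* groups) {
//     for (int i = 0; i < n; ++i) {
//       int idx = indexs[i];
//       int j = counts[idx]++;              // atomicAdd on the GPU
//       groups[idx * kernel_size + j] = i;  // at most kernel_size entries per idx
//     }
//   }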
0 : kernel_size / 2; + index_groups[index * kernel_size + j + group_offset] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 7cabae44e7903..38c9661610766 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -233,18 +233,64 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); + // auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + // dev_ctx, rulebook_len * in_channels, 1); + // + // phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, + // rulebook_ptr, + // x_grad_values_ptr, + // rulebook_len, + // in_channels, + // false); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + DenseTensor unique_value = + phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor out_index = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, - rulebook_ptr, - x_grad_values_ptr, - rulebook_len, - in_channels, - false); + UpdateOutIndex<<>>( + rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + + if (in_channels % VecSize == 0) { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + x_grad_values_ptr); + } else { + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x_grad->nnz() * in_channels, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + x_grad_values_ptr); + } } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 20911109036b3..8c1e1e9e9e33f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,33 +139,47 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, - // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + DenseTensor index_groups = phi::Empty(dev_ctx, {x.nnz() * kernel_size}); + DenseTensor index_counts = phi::Empty(dev_ctx, {n}); + int* index_counts_ptr = index_counts.data(); + int* index_groups_ptr = index_groups.data(); + cudaMemsetAsync(index_counts_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + UpdateOutIndex<<>>( + n, kernel_size, rulebook_ptr, index_counts_ptr, index_groups_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, n * in_channels / VecSize, 1); - GatherKernel + dev_ctx, x.nnz() * in_channels / VecSize, 1); + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, + index_counts_ptr, + index_groups_ptr, + x.nnz(), + 
kernel_size, in_features_ptr, - n, in_channels); } else { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, x.nnz() * in_channels, 1); + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, + index_counts_ptr, + index_groups_ptr, + x.nnz(), + kernel_size, in_features_ptr, - n, in_channels); } @@ -173,10 +187,9 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, auto blas = phi::funcs::GetBlas(dev_ctx); auto* out_values = out->mutable_non_zero_elements(); T* out_values_ptr = out_values->data(); + set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - // set_zero(dev_ctx, out_values, static_cast(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From c9929a2689339ed47c185c910d67e07d8bf9428a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 05:58:01 +0000 Subject: [PATCH 44/70] opt gather of backward --- .../phi/kernels/sparse/gpu/convolution.cu.h | 12 ++--- .../sparse/gpu/convolution_grad_kernel.cu | 54 ++++++++++--------- .../kernels/sparse/gpu/convolution_kernel.cu | 38 +++++-------- 3 files changed, 49 insertions(+), 55 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index dc8600ce31d9d..c5aea43e872ca 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -447,12 +447,12 @@ __global__ void UpdateOutIndexV2(const int n, CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = indexs[i]; // kernel_size at most - int* counts_ptr = - i < half_kernel_offset ? index_counts : index_counts + nnz; - int j = atomicAdd(counts_ptr + index, 1); - // nnz * kernel_size - int group_offset = i < half_kernel_offset ? 0 : kernel_size / 2; - index_groups[index * kernel_size + j + group_offset] = i; + /// int* counts_ptr = + /// i < half_kernel_offset ? index_counts : index_counts + nnz; + /// int j = atomicAdd(counts_ptr + index, 1); + ///// nnz * kernel_size + /// int group_offset = i < half_kernel_offset ? 
0 : kernel_size / 2; + /// index_groups[index * kernel_size + j + group_offset] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 38c9661610766..9571b260f0e3d 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -137,30 +137,52 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } } + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + DenseTensor unique_value = + phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor out_index = + phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + cudaMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + + UpdateOutIndex<<>>( + rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernel + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, + // rulebook_ptr, + out_index_ptr, + unique_value_ptr, rulebook_len, + kernel_size, + in_features_ptr, in_channels); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - GatherKernel + GatherKernelV2 <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, + out_index_ptr, + unique_value_ptr, rulebook_len, + kernel_size, + in_features_ptr, in_channels); } @@ -245,25 +267,9 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, // rulebook_len, // in_channels, // false); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - DenseTensor unique_value = - phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); - DenseTensor out_index = - phi::Empty(dev_ctx, {static_cast(rulebook_len)}); - int* out_index_ptr = out_index.data(); - int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - - UpdateOutIndex<<>>( - rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); if (in_channels % VecSize == 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); phi::funcs::sparse::ScatterKernelV2 <<nnz() * in_channels, 1); phi::funcs::sparse::ScatterKernelV2 <<(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); - DenseTensor index_groups = phi::Empty(dev_ctx, {x.nnz() * kernel_size}); - DenseTensor index_counts = phi::Empty(dev_ctx, {n}); - int* index_counts_ptr = index_counts.data(); - int* index_groups_ptr = index_groups.data(); - cudaMemsetAsync(index_counts_ptr, 0, sizeof(int) * n, dev_ctx.stream()); - UpdateOutIndex<<>>( - n, kernel_size, rulebook_ptr, index_counts_ptr, index_groups_ptr); + // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, + // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels / VecSize, 1); - GatherKernelV2 + 
dev_ctx, n * in_channels / VecSize, 1); + GatherKernel <<>>(x.non_zero_elements().data(), - index_counts_ptr, - index_groups_ptr, - x.nnz(), - kernel_size, + rulebook_ptr, in_features_ptr, + n, in_channels); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels, 1); - GatherKernelV2 + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); + GatherKernel <<>>(x.non_zero_elements().data(), - index_counts_ptr, - index_groups_ptr, - x.nnz(), - kernel_size, + rulebook_ptr, in_features_ptr, + n, in_channels); } @@ -190,6 +176,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { + // set_zero(dev_ctx, out_values, static_cast(0.0f)); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From dab4609996c85e0e5005cbdff138d19ac502ae82 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 07:54:12 +0000 Subject: [PATCH 45/70] resolve conflict --- paddle/phi/kernels/sparse/cpu/convolution_kernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 7c37d90cd4cf9..ecf7073b41109 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" From f66d0c759bc196b2be68b54fcac397078616419d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 28 Jun 2022 11:30:58 +0000 Subject: [PATCH 46/70] opt groups indexs --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 49 ++++++++++++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 56 ++++++++++++++++--- .../sparse/gpu/convolution_grad_kernel.cu | 53 ++++++++---------- .../kernels/sparse/gpu/convolution_kernel.cu | 1 - 4 files changed, 118 insertions(+), 41 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index de0a14c7dbf80..33157304424a3 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -134,6 +134,55 @@ __global__ void ScatterKernelV2(const T* input, } } +template +__global__ void ScatterKernelV3(const T* input, + const int* out_index_counts, + const int* origin_out_indexs, + const int non_zero_num, + const int kernel_size, + const int channels, + T* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + + int len1 = out_index_counts[indices_i]; + StoreT sums = {static_cast(0)}; + phi::Load(out + indices_i * channels + channels_i * VecSize, + &sums); + for (int j = 0; j < len1; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma 
unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + + int len2 = out_index_counts[non_zero_num + indices_i]; + for (int j = 0; j < len2; j++) { + const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j + + kernel_size * non_zero_num]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); +#pragma unroll + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } + } + phi::Store(sums, + out + indices_i * channels + channels_i * VecSize); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 6a85b8622c609..a86f5f06083c2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -96,6 +96,41 @@ __global__ void GatherKernelV2(const T* inputs, } } +template +__global__ void GatherKernelV3(const T* inputs, + const int* index_counts, + const int* origin_indexs, + const int non_zero_num, + const int kernel_size, + T* output, + const int channels) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int vec_channels = channels / VecSize; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + for (int i = tid; i < non_zero_num * vec_channels; + i += gridDim.x * blockDim.x) { + int indices_i = i / vec_channels; + int channels_i = i - indices_i * vec_channels; + int len1 = index_counts[indices_i]; + LoadT in_vec; + phi::Load(inputs + indices_i * channels + channels_i * VecSize, + &in_vec); + for (int j = 0; j < len1; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + int len2 = index_counts[non_zero_num + indices_i]; + for (int j = 0; j < len2; j++) { + int out_i = origin_indexs[indices_i * kernel_size + j + + kernel_size * non_zero_num]; + phi::Store(in_vec, + output + out_i * channels + channels_i * VecSize); + } + } +} + template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -438,21 +473,24 @@ __global__ void UpdateOutIndex(const int n, } template -__global__ void UpdateOutIndexV2(const int n, +__global__ void UpdateOutIndexV2(const int rulebook_len, + const int non_zero_num, const int kernel_size, const int half_kernel_offset, const IntT* indexs, int* index_counts, int* index_groups) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { IntT index = indexs[i]; - // kernel_size at most - /// int* counts_ptr = - /// i < half_kernel_offset ? index_counts : index_counts + nnz; - /// int j = atomicAdd(counts_ptr + index, 1); - ///// nnz * kernel_size - /// int group_offset = i < half_kernel_offset ? 0 : kernel_size / 2; - /// index_groups[index * kernel_size + j + group_offset] = i; + int* counts_ptr = + i < half_kernel_offset ? index_counts : index_counts + non_zero_num; + int* groups_ptr = i < half_kernel_offset + ? 
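// Note on the V3 variants above (ScatterKernelV3 / GatherKernelV3 plus the
// two-bucket index grouping): every index now owns two counter/group slots, one
// for rulebook entries before the half-kernel offset and one for entries after it
// (counts[i] vs. counts[non_zero_num + i], with the second group stored at offset
// non_zero_num * kernel_size). Splitting the entries this way roughly halves the
// contention on the atomicAdd that assigns group slots, and the gather/scatter
// kernels simply walk both lists (len1 and len2) when broadcasting or accumulating.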
index_groups + : index_groups + non_zero_num * kernel_size; + // conflict kernel_size times at most + int j = atomicAdd(counts_ptr + index, 1); + // nnz * kernel_size + groups_ptr[index * kernel_size + j] = i; } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 745ee0e83bb91..72df861cccbef 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -139,48 +139,52 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - DenseTensor unique_value = - phi::Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size)}); + DenseTensor unique_value = phi::Empty( + dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); DenseTensor out_index = - phi::Empty(dev_ctx, {static_cast(rulebook_len)}); + phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); cudaMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); - UpdateOutIndex<<>>( - rulebook_len, kernel_size, rulebook_ptr, out_index_ptr, unique_value_ptr); + UpdateOutIndexV2<<>>(rulebook_len, + x.nnz(), + kernel_size, + offsets[kernel_size / 2], + rulebook_ptr, + out_index_ptr, + unique_value_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels / VecSize, 1); - GatherKernelV2 + dev_ctx, x.nnz() * in_channels / VecSize, 1); + GatherKernelV3 <<>>(x.non_zero_elements().data(), - // rulebook_ptr, out_index_ptr, unique_value_ptr, - rulebook_len, + x.nnz(), kernel_size, in_features_ptr, in_channels); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernelV2 + dev_ctx, x.nnz() * in_channels, 1); + GatherKernelV3 <<>>(x.non_zero_elements().data(), out_index_ptr, unique_value_ptr, - rulebook_len, + x.nnz(), kernel_size, in_features_ptr, in_channels); @@ -255,23 +259,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - // auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - // dev_ctx, rulebook_len * in_channels, 1); - // - // phi::funcs::ScatterCUDAKernel<<>>(d_x_features_ptr, - // rulebook_ptr, - // x_grad_values_ptr, - // rulebook_len, - // in_channels, - // false); - if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 + phi::funcs::sparse::ScatterKernelV3 <<nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV2 + phi::funcs::sparse::ScatterKernelV3 <<(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); From 44ad03e1e4daa15a20bb98e4b123b64501ff75c8 Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Tue, 28 Jun 2022 23:18:53 +0800 Subject: [PATCH 47/70] refine code --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 127 +++++++++++--------- 1 file changed, 73 insertions(+), 54 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index a26bba041912b..1936fccc63e4a 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -138,6 +138,62 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } +template +__device__ __forceinline__ void merge_block_vertical( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.y == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +__device__ __forceinline__ void merge_block_horizonal( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.x == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + template static __global__ void BNForwardTraining2DChannelLastCompStat( const T *x, @@ -180,20 +236,8 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += 
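// The helpers merge_block_vertical / merge_block_horizonal factor out the
// shared-memory tree reduction that was previously written inline: at each step the
// active threads along one block dimension publish their partial (sum, sum of
// squares) to shared memory, the stride `offset` is halved, and the surviving half
// adds in its partner's values. The threads with threadIdx.y == 0 (vertical) or
// threadIdx.x == 0 (horizonal) end up holding the block-wide sums used for the
// mean/variance computation that follows.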
smem_square_sum[pair_tid]; - } - } + merge_block_vertical( + x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); if (gridDim.y > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -228,19 +272,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + merge_block_vertical(x_sum, + x_square_sum, + smem_sum, + smem_square_sum, + &x_sum, + &x_square_sum); // final compute if (threadIdx.y == 0) { @@ -363,19 +400,8 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { - int pair_tid = tid + offset; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + merge_block_horizonal( + x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); if (gridDim.x > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -409,20 +435,13 @@ static __global__ void BNForwardTraining2DCompStat( x_square_sum += staging_square_sum[i + x * C]; } - // vertical block sum - int tid = threadIdx.x + threadIdx.y * blockDim.x; - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { - if (threadIdx.x < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.x < offset && threadIdx.x + offset < blockDim.y) { - int pair_tid = tid + offset; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + smem_sum, + smem_square_sum, + &x_sum, + &x_square_sum); // final compute if (threadIdx.x == 0) { From db9792c1109dfef99892807bd7d8972138fc2ce3 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 1 Jul 2022 13:38:35 +0000 Subject: [PATCH 48/70] replace sort with remove_copy --- .../phi/kernels/sparse/gpu/convolution.cu.h | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a86f5f06083c2..001b38847e5ff 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -137,11 +137,9 @@ __global__ void UniqueKernel(const IntT* in_indexs, int* out_index_table, int* out_indexs, int* nnz) { - extern __shared__ int cache[]; - __shared__ int count, start; + __shared__ int count; if (threadIdx.x == 0) { count = 0; - start = 0; } __syncthreads(); @@ -149,7 +147,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, if (i < rulebook_len) { // atomicOr only support int int index = static_cast(in_indexs[i]); - int change_index = index == 0 ? 1 : index; + int change_index = index == 0 ? 
-1 : index; int flag = atomicOr(out_index_table + index, change_index); if (flag == 0) { int j = atomicAdd(&count, 1); @@ -159,11 +157,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, __syncthreads(); if (threadIdx.x == 0) { - start = atomicAdd(nnz, count); - } - __syncthreads(); - for (int i = threadIdx.x; i < count; i += blockDim.x) { - out_indexs[start + i] = cache[i]; + atomicAdd(nnz, count); } } @@ -330,6 +324,7 @@ __global__ void GetOutIndexTable(const int* indexs, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); + index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -748,10 +743,9 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, @@ -776,9 +770,14 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), - out_index_ptr, - out_index_ptr + out_nnz); + // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_ptr, + // out_index_ptr + out_nnz); + thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), + out_index_table_ptr, + out_index_table_ptr + table_size, + out_index_ptr, + 0); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<< Date: Fri, 1 Jul 2022 13:51:39 +0000 Subject: [PATCH 49/70] fix cache --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 001b38847e5ff..e5285bb7c33d5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -151,7 +151,6 @@ __global__ void UniqueKernel(const IntT* in_indexs, int flag = atomicOr(out_index_table + index, change_index); if (flag == 0) { int j = atomicAdd(&count, 1); - cache[j] = index; } } __syncthreads(); From c2957762ebc09f4db69d588c2f75c1879b653ae4 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sun, 3 Jul 2022 03:03:30 +0000 Subject: [PATCH 50/70] unorder the out index of Conv3D --- .../phi/kernels/sparse/gpu/convolution.cu.h | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 001b38847e5ff..3f975c80d54d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -137,9 +137,11 @@ __global__ void UniqueKernel(const IntT* in_indexs, int* out_index_table, int* out_indexs, int* nnz) { - __shared__ int count; + extern __shared__ int cache[]; + __shared__ int count, start; if (threadIdx.x == 0) { count = 0; + start = 0; } __syncthreads(); @@ -157,7 +159,11 @@ __global__ void UniqueKernel(const IntT* in_indexs, __syncthreads(); if (threadIdx.x == 0) { - atomicAdd(nnz, count); + start = atomicAdd(nnz, count); + } + __syncthreads(); + for (int i = threadIdx.x; i < count; i += blockDim.x) { + out_indexs[start + i] = cache[i]; } } @@ -317,14 +323,14 @@ __global__ void GetOutIndexTable(const IntT* indices, } template -__global__ void GetOutIndexTable(const int* indexs, +__global__ void 
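// How the de-duplication of output indexes works here (a rough reading of
// UniqueKernel as restored in this patch): out_index_table is zero-filled, and each
// thread marks its flattened output index with atomicOr using a non-zero value
// (index 0 is encoded as -1 so the mark stays non-zero). Only the thread that sees
// the old value 0 owns that index; it appends it to the block-shared cache, the
// block reserves a contiguous range in out_indexs with a single atomicAdd on nnz,
// and the cache is flushed into that range. The resulting unique index list is in
// no particular order, which is why the unit test later coalesces the conv output
// before comparing values.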
GetOutIndexTable(int* indexs, const int non_zero_num, const Dims4D out_dims, int* out_index_table, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); - index = index == -1 ? 0 : index; + // index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -334,6 +340,7 @@ __global__ void GetOutIndexTable(const int* indexs, out_indices[i + non_zero_num] = z; out_indices[i + non_zero_num * 2] = y; out_indices[i + non_zero_num * 3] = x; + indexs[i] = 0; } } @@ -743,9 +750,10 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<>>(rulebook_ptr + rulebook_len, rulebook_len, out_index_table_ptr, @@ -759,6 +767,14 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()); dev_ctx.Wait(); + // thrust::pair min_max = + // thrust::minmax_element(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_ptr, out_index_ptr + out_nnz); int start = 0, end = 0; + // cudaMemcpyAsync(&start, min_max.first, sizeof(int), + // cudaMemcpyDeviceToHost, dev_ctx.stream()); cudaMemcpyAsync(&end, + // min_max.second, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); + // dev_ctx.Wait(); + const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); @@ -773,11 +789,12 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), // out_index_ptr, // out_index_ptr + out_nnz); - thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), - out_index_table_ptr, - out_index_table_ptr + table_size, - out_index_ptr, - 0); + // printf("start = %d, end=%d, table_size=%d, nnz=%d\n", start, end, + // table_size); thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_table_ptr + start, + // out_index_table_ptr + end, + // out_index_ptr, + // 0); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); From e181b1c085ea6d5f75dfd4bf5ccee6be6f52e86d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 4 Jul 2022 06:06:10 +0000 Subject: [PATCH 51/70] add coalesced --- paddle/phi/api/yaml/sparse_api.yaml | 7 +++++++ .../phi/kernels/sparse/gpu/coalesced_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 15 +++++++++------ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 17 +++++++++-------- .../tests/unittests/test_sparse_conv_op.py | 1 + python/paddle/incubate/sparse/__init__.py | 3 +++ 6 files changed, 30 insertions(+), 15 deletions(-) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index f3379f8c956db..3987208c2ecc7 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -131,6 +131,13 @@ layout : x backward : values_grad +- api: coalesced + args : (Tensor x) + output : Tensor(out) + kernel : + func: coalesced{sparse_coo -> sparse_coo} + layout : x + - api: full_like args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) output : Tensor(out) diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index c5f0c332c7bbc..bf59c10bedd96 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ 
b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -192,7 +192,7 @@ void CoalescedKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesced, GPU, ALL_LAYOUT, phi::sparse::CoalescedKernel, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 3f975c80d54d0..c923e6df5fc9f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -318,7 +318,7 @@ __global__ void GetOutIndexTable(const IntT* indices, IntT in_y = indices[i + 2 * non_zero_num]; IntT in_x = indices[i + 3 * non_zero_num]; IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); - out_index_table[index] = i; + out_index_table[index] = i == 0 ? -1 : i; } } @@ -429,7 +429,8 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, out_index = phi::funcs::sparse::PointToIndex( batch, out_x, out_y, out_z, out_dims); int real_out_index = out_index_table[out_index]; - if (real_out_index != -1) { + if (real_out_index != 0) { + real_out_index = real_out_index == -1 ? 0 : real_out_index; in_i = i; int buf_i = atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; @@ -581,10 +582,12 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); IntT* out_index_table_ptr = out_index_table.data(); - thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), - out_index_table_ptr, - out_index_table_ptr + out_index_table.numel(), - -1); + // thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), + // out_index_table_ptr, + // out_index_table_ptr + out_index_table.numel(), + // -1); + cudaMemsetAsync( + out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 36f79dd346680..0e399a7b0e81f 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -94,14 +94,15 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel<<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); out->SetMember(out_indices, out_values, dims, true); } diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index afd9c33421660..e43db8bd9150b 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -54,6 +54,7 @@ def test_conv3d(self): key='conv3d', data_format="NDHWC") out.backward(out) + out = paddle.incubate.sparse.coalesced(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index f696434118745..6c9678873abe5 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -28,6 +28,8 @@ from .math import multiply from .math import subtract +from .coalesced import coalesced + from . 
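# Note on the `coalesced` API wired up above: since the SparseCooTensor constructor
# no longer coalesces by default (see the earlier sparse_utils_kernel.h change) and
# Conv3D now emits its output indices in an arbitrary order, sorting/merging of
# coordinates has to be requested explicitly. Typical usage, as in the updated unit
# test (a hedged example, not additional API surface):
#     out = paddle.incubate.sparse.coalesced(out)  # sort indices, merge duplicates
#     assert np.array_equal(correct_out_values, out.values().numpy())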
import nn __all__ = [ @@ -43,4 +45,5 @@ 'subtract', 'multiply', 'divide', + 'coalesced', ] From e2bf43a6e9ead0886f57947ce1eb431f7dc86f22 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 4 Jul 2022 07:44:16 +0000 Subject: [PATCH 52/70] add coalesced.py --- python/paddle/incubate/sparse/coalesced.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 python/paddle/incubate/sparse/coalesced.py diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py new file mode 100644 index 0000000000000..dcd2f8ca28f3a --- /dev/null +++ b/python/paddle/incubate/sparse/coalesced.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import _C_ops +from paddle.fluid.framework import core, dygraph_only + +__all__ = [ + 'coalesced', +] + + +@dygraph_only +def coalesced(x): + return _C_ops.final_state_sparse_coalesced(x) From 0aa457fc6e8b0ffda8b0bb28330baf02b343c8eb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 06:35:03 +0000 Subject: [PATCH 53/70] coalesced before compare result --- paddle/phi/kernels/sparse/coalesced_kernel.h | 7 + .../kernels/sparse/gpu/coalesced_kernel.cu | 13 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 122 ++++++------------ .../sparse/gpu/convolution_grad_kernel.cu | 20 +-- .../kernels/sparse/gpu/convolution_kernel.cu | 78 +++++------ .../kernels/test_sparse_conv3d_dev_api.cc | 6 +- 6 files changed, 89 insertions(+), 157 deletions(-) diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesced_kernel.h index 0755579a57ade..d2f5f8f3150af 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/sparse/coalesced_kernel.h @@ -26,5 +26,12 @@ void CoalescedKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); +template +SparseCooTensor Coalesced(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + CoalescedKernel(dev_ctx, x, &coo); + return coo; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index bf59c10bedd96..ac147ccd0abb6 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -55,11 +55,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 1. 
flatten indices @@ -117,11 +113,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&out_nnz, out_indices.data(), sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); dev_ctx.Wait(); @@ -188,7 +180,6 @@ void CoalescedKernel(const Context& dev_ctx, CoalescedGPUKernel(dev_ctx, x, out); })); } - } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index c923e6df5fc9f..4c0929a91cefd 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -37,17 +37,9 @@ namespace sparse { using Dims4D = phi::funcs::sparse::Dims4D; -// TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace -// this kernel with phi::GatherCUDAKernel; -// Vectorization can be used to improve read and write bandwidth -/** - * brief: gather data from params according to indices - * params: the inputs - * indices: the indices you want to gather - * output: the outputs - * index_size: the size of indices - * slice_size: slice size corresponding to each index, here is the channel size - **/ +// Vectorize load and store global memory +// In the scene of 3D point cloud, the slice_size 4,8,16,32,64 are commonly +// used. template __global__ void GatherKernel(const T* params, const IndexT* indices, @@ -68,6 +60,7 @@ __global__ void GatherKernel(const T* params, } } +// the index_counts records the number of times the same index will be gather template __global__ void GatherKernelV2(const T* inputs, const int* index_counts, @@ -96,6 +89,7 @@ __global__ void GatherKernelV2(const T* inputs, } } +// double sparse, seed GroupIndexs template __global__ void GatherKernelV3(const T* inputs, const int* index_counts, @@ -131,6 +125,7 @@ __global__ void GatherKernelV3(const T* inputs, } } +// unique the out indexs in rulebook template __global__ void UniqueKernel(const IntT* in_indexs, const int rulebook_len, @@ -168,12 +163,12 @@ __global__ void UniqueKernel(const IntT* in_indexs, } template -__global__ void UpdateOutIndex(const int* out_index_table, - const int n, - const int kernel_size, - IntT* out_indexs, - int* out_index_counts, - int* origin_out_indexs) { +__global__ void GroupIndexs(const int* out_index_table, + const int n, + const int kernel_size, + IntT* out_indexs, + int* out_index_counts, + int* out_index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = out_indexs[i]; int real_index = out_index_table[index]; @@ -182,7 +177,7 @@ __global__ void UpdateOutIndex(const int* out_index_table, // kernel_size at most int j = atomicAdd(out_index_counts + real_index, 1); // nnz * kernel_size - origin_out_indexs[real_index * kernel_size + j] = i; + out_index_groups[real_index * kernel_size + j] = i; } } @@ -461,11 +456,11 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, } template -__global__ void UpdateOutIndex(const int n, - const int kernel_size, - const IntT* indexs, - int* index_counts, - int* index_groups) { +__global__ void GroupIndexs(const int n, + const int kernel_size, + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { IntT index = indexs[i]; // kernel_size at most @@ -475,14 +470,15 @@ __global__ void UpdateOutIndex(const int n, } } +// double space to reduce atomicAdd conflict template -__global__ void UpdateOutIndexV2(const int rulebook_len, - const 
int non_zero_num, - const int kernel_size, - const int half_kernel_offset, - const IntT* indexs, - int* index_counts, - int* index_groups) { +__global__ void GroupIndexsV2(const int rulebook_len, + const int non_zero_num, + const int kernel_size, + const int half_kernel_offset, + const IntT* indexs, + int* index_counts, + int* index_groups) { CUDA_KERNEL_LOOP_TYPE(i, rulebook_len, int64_t) { IntT index = indexs[i]; int* counts_ptr = @@ -545,8 +541,6 @@ int ProductRuleBook(const Context& dev_ctx, Dims4D d_strides(1, strides[2], strides[1], strides[0]); Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); // 1. product rule book - // phi::funcs::SetConstant set_zero; - // set_zero(dev_ctx, counter_per_kernel, 0); phi::backends::gpu::GpuMemsetAsync(counter_ptr, 0, sizeof(int) * counter_per_kernel->numel(), @@ -555,11 +549,6 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); if (subm) { - // At present, hashtable is not used to map the input and output indexes. - // At present, the intermediate output index is generated by normal - // convolution, - // and then the intermediate output index is subtracted from the input index - // to obain the rulebook. const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; DenseTensorMeta rulebook_meta( @@ -582,11 +571,7 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); IntT* out_index_table_ptr = out_index_table.data(); - // thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_table_ptr, - // out_index_table_ptr + out_index_table.numel(), - // -1); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); auto config = @@ -696,21 +681,6 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - // phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - // rulebook_ptr, last, rulebook_ptr + rulebook_rows * rulebook_cols - - // 1); - // IntT rulebook_len = 0; - // phi::backends::gpu::GpuMemcpyAsync( - // &rulebook_len, - // rulebook_ptr + rulebook_rows * rulebook_cols - 1, - // sizeof(IntT), - // gpuMemcpyDeviceToHost, - // dev_ctx.stream()); - - // dev_ctx.Wait(); - // rulebook_len /= 2; - // printf("rulebook_len = %d\n", rulebook_len); - // printf("distance = %d\n", last-rulebook_ptr); IntT rulebook_len = (last - rulebook_ptr) / 2; #ifdef PADDLE_WITH_HIP @@ -770,14 +740,6 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.stream()); dev_ctx.Wait(); - // thrust::pair min_max = - // thrust::minmax_element(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_ptr, out_index_ptr + out_nnz); int start = 0, end = 0; - // cudaMemcpyAsync(&start, min_max.first, sizeof(int), - // cudaMemcpyDeviceToHost, dev_ctx.stream()); cudaMemcpyAsync(&end, - // min_max.second, sizeof(int), cudaMemcpyDeviceToHost, dev_ctx.stream()); - // dev_ctx.Wait(); - const int64_t sparse_dim = 4; DenseTensorMeta indices_meta( indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); @@ -789,16 +751,6 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - // thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_ptr, - // out_index_ptr + out_nnz); - // printf("start = %d, end=%d, table_size=%d, nnz=%d\n", start, end, - // table_size); thrust::remove_copy(thrust::cuda::par.on(dev_ctx.stream()), - // out_index_table_ptr + start, - // 
out_index_table_ptr + end, - // out_index_ptr, - // 0); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); // return rulebook_len; - UpdateOutIndex<<>>(out_index_table_ptr, - rulebook_len, - kernel_size, - rulebook_ptr + rulebook_len, - out_index_ptr, - unique_value_ptr); + GroupIndexs<<>>(out_index_table_ptr, + rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); return rulebook_len; } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 72df861cccbef..06aed45b488e4 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -148,16 +148,16 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, cudaMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); - UpdateOutIndexV2<<>>(rulebook_len, - x.nnz(), - kernel_size, - offsets[kernel_size / 2], - rulebook_ptr, - out_index_ptr, - unique_value_ptr); + GroupIndexsV2<<>>(rulebook_len, + x.nnz(), + kernel_size, + offsets[kernel_size / 2], + rulebook_ptr, + out_index_ptr, + unique_value_ptr); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 8e09c4c24e7d5..3a7415870e103 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -139,8 +139,6 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - // phi::backends::gpu::GpuMemsetAsync(out_features_ptr, - // static_cast(0.0f), sizeof(T) * out_features.numel(), dev_ctx.stream()); const int VecSize = VecBytes / sizeof(T); if (in_channels % VecSize == 0) { @@ -176,18 +174,18 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - // set_zero(dev_ctx, out_values, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); out_index.ResizeAndAllocate({static_cast(n)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync(out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); - UpdateOutIndex<<>>( + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + GroupIndexs<<>>( n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); } const T* kernel_ptr = kernel.data(); @@ -217,50 +215,34 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - if (false) { - set_zero(dev_ctx, out_values, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); - phi::funcs::ScatterCUDAKernel - <<nnz() * out_channels / VecSize, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(out_features_ptr, - rulebook_ptr + n, - out_values_ptr, - n, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, out_channels, - false); + out_values_ptr); } else { - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - out_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - out_values_ptr); - } + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, out->nnz() * out_channels, 1); + phi::funcs::sparse::ScatterKernelV2 + <<>>(out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + out_values_ptr); } } /** diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 5f3c290a2414e..6f03f3e1ec0b6 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -207,6 +208,7 @@ void TestConv3dBase(const std::vector& indices, 1, subm, "Conv3d_0"); + SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -218,7 +220,7 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -232,7 +234,7 @@ void TestConv3dBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); From 749dccefd077f86254d09eaba020e20a7a018ae9 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 11:54:05 +0000 Subject: [PATCH 54/70] the key of conv3d is not required --- .../final_state_generator/eager_gen.py | 3 + paddle/phi/api/yaml/sparse_api.yaml | 5 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +- .../kernels/sparse/convolution_grad_kernel.h | 6 ++ .../phi/kernels/sparse/convolution_kernel.h | 12 ++- .../sparse/cpu/convolution_grad_kernel.cc | 48 +++++++--- .../kernels/sparse/cpu/convolution_kernel.cc | 79 ++++++++++------ .../phi/kernels/sparse/gpu/convolution.cu.h | 15 +-- .../sparse/gpu/convolution_grad_kernel.cu | 39 ++++++-- .../kernels/sparse/gpu/convolution_kernel.cu | 91 ++++++++++++------- 
.../kernels/test_sparse_conv3d_dev_api.cc | 18 ++-- .../tests/unittests/test_sparse_conv_op.py | 4 +- .../incubate/sparse/nn/functional/conv.py | 9 +- .../paddle/incubate/sparse/nn/layer/conv.py | 3 +- 14 files changed, 221 insertions(+), 117 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d406f00b25039..a595bf5c613c6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -545,6 +545,9 @@ def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map backward_grad_inputs_map = self.backward_grad_inputs_map backward_attrs_list = self.backward_attrs_list + print(backward_forward_inputs_map) + print(backward_grad_inputs_map) + print(backward_attrs_list) # Check Order: TensorWrappers, GradTensors, Attributes max_fwd_input_position = -1 diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 3987208c2ecc7..cc583fce1f8a1 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -9,10 +9,11 @@ - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) - output : Tensor(out) + output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo} + func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense} layout : x + intermediate: rulebook, counter backward : conv3d_grad - api : coo_to_dense diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index c96443bc8241b..04cdfccce636e 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -7,11 +7,11 @@ add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor) - args : (Tensor x, Tensor kernel, Tensor out, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo, dense} + func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 52be7ffd02903..54d09babb2cf9 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,6 +27,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, 
const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -43,6 +45,8 @@ std::tuple Conv3dGrad( const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -58,6 +62,8 @@ std::tuple Conv3dGrad( x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 6acc3241385a3..62559d4e0ff1e 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -32,7 +32,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out); + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); template SparseCooTensor Conv3d(const Context& dev_ctx, @@ -43,7 +45,9 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, - const std::string& key) { + const std::string& key, + DenseTensor* rulebook, + DenseTensor* counter) { SparseCooTensor coo; Conv3dKernel(dev_ctx, x, @@ -54,7 +58,9 @@ SparseCooTensor Conv3d(const Context& dev_ctx, groups, subm, key, - &coo); + &coo, + rulebook, + counter); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 071d586dc7d56..5c6c2539c0a74 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -35,6 +35,8 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -48,12 +50,26 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - // const DenseTensor& rulebook = out.rulebook(); - const auto* table = out.table(key); - const DenseTensor& rulebook = table->first; - const IntT* rulebook_ptr = rulebook.data(); - const int rulebook_len = rulebook.dims()[1]; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + const int* counter_ptr = nullptr; + bool cache_in_table = false; + if (!key.empty()) { + const auto* table = out.table(key); + if (table != nullptr) { + cache_in_table = true; + const DenseTensor& tmp_rulebook = table->first; + rulebook_ptr = tmp_rulebook.data(); + rulebook_len = tmp_rulebook.dims()[1]; + counter_ptr = table->second.data(); + } + } + if (!cache_in_table) { + rulebook_ptr = rulebook.data(); + rulebook_len = rulebook.dims()[1]; + counter_ptr = counter.data(); + } DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); @@ -90,16 +106,14 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, &x_grad_indices); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0); - for (int i = 0; i < rulebook_len; i++) { - counter[rulebook_ptr[i]] += 1; - } - IntT offset = 0, max_count = 0; + std::vector offsets(kernel_size + 1); + IntT offset = 0; + int max_count = 0; for (int i = 0; 
i < kernel_size; i++) { offsets[i] = offset; - offset += counter[i]; + offset += counter_ptr[i]; if (i < half_kernel_size) { - max_count = std::max(max_count, counter[i]); + max_count = std::max(max_count, counter_ptr[i]); } } offsets[kernel_size] = offset; @@ -133,11 +147,11 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0 || (subm && i == half_kernel_size)) { + if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { continue; } - const int M = counter[i]; + const int M = counter_ptr[i]; const int K = in_channels; const int N = out_channels; T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; @@ -175,7 +189,7 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, // 4. scatter Scatter(d_x_features_ptr, - rulebook.data() + rulebook_len, + rulebook_ptr + rulebook_len, rulebook_len, in_channels, x_grad_values_ptr); @@ -186,6 +200,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -201,6 +217,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index ecf7073b41109..f5f7497df96fb 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -37,7 +37,9 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -71,27 +73,34 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; - PADDLE_ENFORCE(!key.empty(), - phi::errors::Fatal("the key of sparse conv must be not null")); int n = 0; - const auto* table = x.table(key); - if (subm && table != nullptr) { - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - memcpy(counter_per_kernel.data(), - table->second.data(), - kernel_size * sizeof(int)); - } else { - DenseTensor rulebook; + bool need_product_rulebook = true; + if (subm && !key.empty()) { + const auto* table = x.table(key); + if (table != nullptr) { + need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + rulebook_ptr = rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); + n = rulebook.dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = + phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + 
memcpy(counter_per_kernel.data(), + table->second.data(), + kernel_size * sizeof(int)); + } + } + if (need_product_rulebook) { + DenseTensor tmp_rulebook; ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -100,15 +109,25 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, subm_strides, out_dims, subm, - &rulebook, + &tmp_rulebook, &counter_per_kernel); UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, out_channels, out_dims, &rulebook, out); - n = rulebook.dims()[1]; + dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out); + n = tmp_rulebook.dims()[1]; + rulebook_ptr = tmp_rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); - out->SetTable(key, std::make_pair(rulebook, counter_per_kernel)); - rulebook_ptr = rulebook.data(); + if (!key.empty()) { + out->SetTable(key, std::make_pair(tmp_rulebook, counter_per_kernel)); + } else { + *rulebook = tmp_rulebook; + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, + counter_per_kernel.data(), + counter_per_kernel.size() * sizeof(int)); + } } // int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -183,7 +202,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, @@ -195,7 +216,9 @@ void Conv3dKernel(const Context& dev_ctx, groups, subm, key, - out); + out, + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 4c0929a91cefd..139259f48c8b3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -718,9 +718,10 @@ int ProductRuleBook(const Context& dev_ctx, } DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); int* out_index_table_ptr = out_index_table.data(); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); - cudaMemsetAsync(unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync( + unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; @@ -733,11 +734,11 @@ int ProductRuleBook(const Context& dev_ctx, out_index_ptr, unique_key_ptr); int out_nnz = 0; - cudaMemcpyAsync(&out_nnz, - unique_key_ptr, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&out_nnz, + unique_key_ptr, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); const int64_t sparse_dim = 4; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 06aed45b488e4..741720b20b9ff 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -44,6 +44,8 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -58,11 +60,25 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const 
int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - const auto* table = out.table(key); - const DenseTensor& rulebook = table->first; - const IntT* rulebook_ptr = rulebook.data(); - - const int rulebook_len = rulebook.dims()[1]; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + const int* counter_ptr = nullptr; + bool cache_in_table = false; + if (!key.empty()) { + const auto* table = out.table(key); + if (table != nullptr) { + cache_in_table = true; + const DenseTensor& tmp_rulebook = table->first; + rulebook_ptr = tmp_rulebook.data(); + rulebook_len = tmp_rulebook.dims()[1]; + counter_ptr = table->second.data(); + } + } + if (!cache_in_table) { + rulebook_ptr = rulebook.data(); + rulebook_len = rulebook.dims()[1]; + counter_ptr = counter.data(); + } DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); @@ -109,14 +125,13 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); std::vector offsets(kernel_size + 1); - const auto& counter = table->second; int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; - offset += counter[i]; + offset += counter_ptr[i]; if (i < half_kernel_size) { - max_count = std::max(max_count, counter[i]); + max_count = std::max(max_count, counter_ptr[i]); } } offsets[kernel_size] = offset; @@ -218,11 +233,11 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0 || (subm && i == half_kernel_size)) { + if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { continue; } - const int M = counter[i]; + const int M = counter_ptr[i]; const int K = in_channels; const int N = out_channels; T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; @@ -295,6 +310,8 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, @@ -310,6 +327,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x, kernel, out, + rulebook, + counter, out_grad, paddings, dilations, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 3a7415870e103..6c071d963ebd2 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -23,6 +23,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "glog/logging.h" + namespace phi { namespace sparse { @@ -36,7 +38,9 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -76,35 +80,41 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; int n = 0; const IntT* rulebook_ptr = nullptr; - PADDLE_ENFORCE_EQ( - key.empty(), - false, - phi::errors::Fatal("the key of sparse conv must be not null")); - const auto* table = x.table(key); - if (subm && table != nullptr) { - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); + bool need_product_rulebook = true; + if (subm && !key.empty()) { + const auto* table = x.table(key); + if (table != nullptr) { + need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + rulebook_ptr = rulebook.data(); + memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); + out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; + n = rulebook.dims()[1]; - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy( - dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - IntT offset = 0; - for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += h_counter[i]; + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = + phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &out_indices); + out->SetMember(out_indices, out_values, out_dims, true); + IntT offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += h_counter[i]; + } + offsets[kernel_size] = offset; } - offsets[kernel_size] = offset; - } else { - DenseTensor rulebook; + } + if (need_product_rulebook) { + DenseTensor tmp_rulebook; n = ProductRuleBook(dev_ctx, x, kernel_sizes, @@ -113,7 +123,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, subm_strides, out_dims, subm, - &rulebook, + &tmp_rulebook, &counter_per_kernel, &offsets_per_kernel, &out_index, @@ -121,9 +131,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out, &h_counter, &offsets); + rulebook_ptr = tmp_rulebook.data(); + out->SetTablePtr(x.GetTablePtr()); - out->SetTable(key, std::make_pair(rulebook, h_counter)); - rulebook_ptr = rulebook.data(); + if (!key.empty()) { + out->SetTable(key, std::make_pair(tmp_rulebook, h_counter)); + } else { + *rulebook = tmp_rulebook; + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); + } } // 2. 
gather @@ -245,10 +263,13 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out_values_ptr); } } + /** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) + * x: the input SparseCooTensor, shape is (N, D, H, W, C) + * kernel: the weight data, shape is (D, H, W, C, OC) + * out: the output SparseCooTensor, shape is (N, D, H, W, OC) + * rulebook: return rulebook if key is not vailed else return nullptr + * counter: return counter if key is not vailed else return nullptr **/ template void Conv3dKernel(const Context& dev_ctx, @@ -260,7 +281,9 @@ void Conv3dKernel(const Context& dev_ctx, const int groups, const bool subm, const std::string& key, - SparseCooTensor* out) { + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, @@ -272,7 +295,9 @@ void Conv3dKernel(const Context& dev_ctx, groups, subm, key, - out); + out, + rulebook, + counter); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 6f03f3e1ec0b6..df4fec61a9a3d 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -112,8 +112,7 @@ void TestConv3dBase(const std::vector& indices, }; if (!std::is_same::value) { - // DenseTensor rulebook = phi::Empty( - // dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + DenseTensor rulebook, counter; SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, x_tensor, kernel_tensor, @@ -122,7 +121,9 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d_0", + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -143,6 +144,8 @@ void TestConv3dBase(const std::vector& indices, x_tensor, kernel_tensor, out, + rulebook, + counter, out, paddings, dilations, @@ -197,8 +200,7 @@ void TestConv3dBase(const std::vector& indices, phi::Copy( dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); - // DenseTensor d_rulebook = phi::Empty( - // dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); + DenseTensor d_rulebook, d_counter; SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, d_x_tensor, d_kernel_tensor, @@ -207,7 +209,9 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d_0", + &d_rulebook, + &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -246,6 +250,8 @@ void TestConv3dBase(const std::vector& indices, d_x_tensor, d_kernel_tensor, d_out, + d_rulebook, + d_counter, d_out, paddings, dilations, diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e43db8bd9150b..ede33e4167472 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -51,7 +51,6 @@ def test_conv3d(self): padding=paddings, dilation=dilations, groups=1, - key='conv3d', data_format="NDHWC") out.backward(out) out = paddle.incubate.sparse.coalesced(out) @@ -86,7 +85,7 @@ def test_Conv3D(self): indices, values, dense_shape, False) sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( - 1, 1, (1, 3, 3), data_format='NDHWC', key='conv3d') + 1, 1, (1, 3, 3), data_format='NDHWC') sparse_out = 
sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): @@ -131,7 +130,6 @@ def test_Conv3D_bias(self): sp_conv3d = paddle.incubate.sparse.nn.Conv3D(3, 2, 3, - key='conv3d', data_format='NDHWC') sp_conv3d.weight.set_value( paddle.to_tensor(conv3d.weight.numpy().transpose(2, 3, 4, 1, diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index 62800dd01c65b..2dda83b2c1659 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -63,9 +63,9 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - pre_bias = _C_ops.final_state_sparse_conv3d( - x, weight, padding, dilation, stride, groups, subm, - key if key is not None else name) + pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, + stride, groups, subm, + key if key is not None else "") if bias is not None: values = pre_bias.values() add_bias = elementwise_add(values, bias, axis=1) @@ -84,7 +84,6 @@ def conv3d(x, padding=0, dilation=1, groups=1, - key=None, data_format="NDHWC", name=None): r""" @@ -189,7 +188,7 @@ def conv3d(x, # (1, 1, 1, 2, 1) """ return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, - key, data_format, name) + None, data_format, name) def subm_conv3d(x, diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py index b583fb6c12627..c7fe1f7b4033e 100644 --- a/python/paddle/incubate/sparse/nn/layer/conv.py +++ b/python/paddle/incubate/sparse/nn/layer/conv.py @@ -231,7 +231,6 @@ def __init__(self, padding=0, dilation=1, groups=1, - key=None, padding_mode='zeros', weight_attr=None, bias_attr=None, @@ -244,7 +243,7 @@ def __init__(self, dilation=dilation, groups=groups, subm=False, - key=key, + key=None, padding_mode=padding_mode, weight_attr=weight_attr, bias_attr=bias_attr, From d38563b5bda1c67c32c2b0a77c664fa0b11be535 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 5 Jul 2022 12:20:52 +0000 Subject: [PATCH 55/70] opt code structure --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 108 +++--------------- .../phi/kernels/sparse/gpu/convolution.cu.h | 60 +++------- .../sparse/gpu/convolution_grad_kernel.cu | 20 ++-- .../kernels/sparse/gpu/convolution_kernel.cu | 2 + 4 files changed, 47 insertions(+), 143 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 33157304424a3..270d17722c3a4 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -67,80 +67,17 @@ __global__ void ScatterKernel(const T* input, out + indices_i * channels + channels_i * VecSize); } } - -template -__global__ void ScatterCUDAKernel(const T* params, - const IndexT* indices, - T* output, - size_t index_size, - size_t slice_size, - bool overwrite) { - const size_t vec_slice_size = slice_size / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - CUDA_KERNEL_LOOP_TYPE(i, index_size * vec_slice_size, int64_t) { - int64_t indices_i = i / vec_slice_size; - int64_t slice_i = - i - indices_i * vec_slice_size; // offset inside the slice - IndexT scatter_i = indices[indices_i]; - - int64_t out_i = scatter_i * slice_size + slice_i * VecSize; - LoadT vec_params, vec_out; - phi::Load(params + i * VecSize, &vec_params); - phi::Load(output + out_i, &vec_out); -#pragma unroll - for (int j = 0; j < VecSize; j++) { - vec_out[j] += 
vec_params[j]; - } - phi::Store(vec_out, output + out_i); - // output[out_i] += params[i]; - } -} - +// scatter's index has been grouped in advance +// index_counts record the count of every group +// index_groups save the index of every group template __global__ void ScatterKernelV2(const T* input, - const int* out_index_counts, - const int* origin_out_indexs, - const int non_zero_num, - const int kernel_size, - const int channels, - T* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; - i += gridDim.x * blockDim.x) { - int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; - - int len = out_index_counts[indices_i]; - // max(end-start) = kernel_size - StoreT sums = {static_cast(0)}; - phi::Load(out + indices_i * channels + channels_i * VecSize, - &sums); - for (int j = 0; j < len; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); -#pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; - } - } - phi::Store(sums, - out + indices_i * channels + channels_i * VecSize); - } -} - -template -__global__ void ScatterKernelV3(const T* input, - const int* out_index_counts, - const int* origin_out_indexs, + const int* index_counts, + const int* index_groups, const int non_zero_num, const int kernel_size, const int channels, + const int buffer_counts, T* out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; @@ -151,31 +88,22 @@ __global__ void ScatterKernelV3(const T* input, int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; - int len1 = out_index_counts[indices_i]; StoreT sums = {static_cast(0)}; phi::Load(out + indices_i * channels + channels_i * VecSize, &sums); - for (int j = 0; j < len1; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); + for (int it = 0; it < buffer_counts; it++) { + int len = index_counts[indices_i + it * non_zero_num]; + const int group_offset = it * kernel_size * non_zero_num; + for (int j = 0; j < len; j++) { + const int out_feature_i = + index_groups[indices_i * kernel_size + j + group_offset]; + LoadT vec_in; + phi::Load( + input + out_feature_i * channels + channels_i * VecSize, &vec_in); #pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; - } - } - - int len2 = out_index_counts[non_zero_num + indices_i]; - for (int j = 0; j < len2; j++) { - const int out_feature_i = origin_out_indexs[indices_i * kernel_size + j + - kernel_size * non_zero_num]; - LoadT vec_in; - phi::Load( - input + out_feature_i * channels + channels_i * VecSize, &vec_in); -#pragma unroll - for (int k = 0; k < VecSize; k++) { - sums[k] += vec_in[k]; + for (int k = 0; k < VecSize; k++) { + sums[k] += vec_in[k]; + } } } phi::Store(sums, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 139259f48c8b3..f1d5074a777c0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -60,44 +60,16 @@ __global__ void GatherKernel(const T* params, } } -// the index_counts records the number of times the same index 
will be gather -template -__global__ void GatherKernelV2(const T* inputs, - const int* index_counts, - const int* origin_indexs, - const int non_zero_num, - const int kernel_size, - T* output, - const int channels) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - const int vec_channels = channels / VecSize; - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - for (int i = tid; i < non_zero_num * vec_channels; - i += gridDim.x * blockDim.x) { - int indices_i = i / vec_channels; - int channels_i = i - indices_i * vec_channels; - int len = index_counts[indices_i]; - LoadT in_vec; - phi::Load(inputs + indices_i * channels + channels_i * VecSize, - &in_vec); - for (int j = 0; j < len; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); - } - } -} - // double sparse, seed GroupIndexs template -__global__ void GatherKernelV3(const T* inputs, +__global__ void GatherKernelV2(const T* inputs, const int* index_counts, - const int* origin_indexs, + const int* index_groups, const int non_zero_num, const int kernel_size, - T* output, - const int channels) { + const int channels, + const int buffer_count, + T* output) { int tid = threadIdx.x + blockIdx.x * blockDim.x; const int vec_channels = channels / VecSize; using LoadT = phi::AlignedVector; @@ -106,21 +78,19 @@ __global__ void GatherKernelV3(const T* inputs, i += gridDim.x * blockDim.x) { int indices_i = i / vec_channels; int channels_i = i - indices_i * vec_channels; - int len1 = index_counts[indices_i]; LoadT in_vec; phi::Load(inputs + indices_i * channels + channels_i * VecSize, &in_vec); - for (int j = 0; j < len1; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); - } - int len2 = index_counts[non_zero_num + indices_i]; - for (int j = 0; j < len2; j++) { - int out_i = origin_indexs[indices_i * kernel_size + j + - kernel_size * non_zero_num]; - phi::Store(in_vec, - output + out_i * channels + channels_i * VecSize); +#pragma unroll + for (int it = 0; it < buffer_count; it++) { + int len = index_counts[indices_i + it * non_zero_num]; + const int group_offset = it * kernel_size * non_zero_num; +#pragma unroll + for (int j = 0; j < len; j++) { + int out_i = index_groups[indices_i * kernel_size + j + group_offset]; + phi::Store( + in_vec, output + out_i * channels + channels_i * VecSize); + } } } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 741720b20b9ff..b6f674576f934 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -178,7 +178,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, if (in_channels % VecSize == 0) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.nnz() * in_channels / VecSize, 1); - GatherKernelV3 + GatherKernelV2 << + GatherKernelV2 <<nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV3 + phi::funcs::sparse::ScatterKernelV2 <<nnz(), kernel_size, in_channels, + 2, x_grad_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_grad->nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV3 + phi::funcs::sparse::ScatterKernelV2 <<nnz(), kernel_size, in_channels, + 2, x_grad_values_ptr); } } diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu 
b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 6c071d963ebd2..1381373abc03b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -246,6 +246,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out->nnz(), kernel_size, out_channels, + 1, out_values_ptr); } else { auto config = phi::backends::gpu::GetGpuLaunchConfig1D( @@ -260,6 +261,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, out->nnz(), kernel_size, out_channels, + 1, out_values_ptr); } } From d06527fa24e0d9ca01b5231173da173a07f92604 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 02:27:06 +0000 Subject: [PATCH 56/70] opt gather/scatter code structure --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 44 +++++++ .../phi/kernels/sparse/gpu/convolution.cu.h | 113 ++++++++++------- .../sparse/gpu/convolution_grad_kernel.cu | 119 ++++-------------- .../kernels/sparse/gpu/convolution_kernel.cu | 72 +++-------- 4 files changed, 154 insertions(+), 194 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 270d17722c3a4..f7c4b7642d7bd 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #define VecBytes 16 @@ -111,6 +113,48 @@ __global__ void ScatterKernelV2(const T* input, } } +template +void ScatterV2(const GPUContext& dev_ctx, + const T* input, + const int* index_counts, + const int* index_groups, + const int non_zero_num, + const int kernel_size, + const int channels, + const int buffer_counts, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels / VecSize, 1); + ScatterKernelV2<<>>(input, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_counts, + output); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels, 1); + ScatterKernelV2<<>>(input, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_counts, + output); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index f1d5074a777c0..47409d3a0664b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -95,6 +96,75 @@ __global__ void GatherKernelV2(const T* inputs, } } +template +inline void Gather(const GPUContext& dev_ctx, + const T* inputs, + const IntT* indices, + const int indices_size, + const int channels, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, indices_size * channels / VecSize, 1); + GatherKernel + <<>>(inputs, indices, output, indices_size, channels); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, indices_size * channels, 1); + GatherKernel + <<>>(inputs, indices, output, indices_size, channels); + } +} + +template +inline void GatherV2(const GPUContext& dev_ctx, + const T* inputs, + const int* index_counts, + const int* index_groups, + const int non_zero_num, + const int kernel_size, + const int channels, + const int buffer_count, + T* output) { + const int VecSize = VecBytes / sizeof(T); + if (channels % VecSize == 0) { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels / VecSize, 1); + GatherKernelV2<<>>(inputs, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_count, + output); + } else { + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, non_zero_num * channels, 1); + GatherKernelV2<<>>(inputs, + index_counts, + index_groups, + non_zero_num, + kernel_size, + channels, + buffer_count, + output); + } +} + // unique the out indexs in rulebook template __global__ void UniqueKernel(const IntT* in_indexs, @@ -151,47 +221,6 @@ __global__ void GroupIndexs(const int* out_index_table, } } -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook - **/ -template -__global__ void UpdateIndexKernel(const T* unique_keys, - const int* unique_values, - const int* out_indexs, - const int64_t non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - T* out_indices, - T* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const T index = unique_keys[i]; - T batch, x, y, z; - phi::funcs::sparse::IndexToPoint( - index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (T j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - /** * @brief product rulebook * for input_i in x_indices: @@ -295,7 +324,6 @@ __global__ void GetOutIndexTable(int* indexs, IntT* out_indices) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); - // index = index == -1 ? 0 : index; out_index_table[index] = i; IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( @@ -334,7 +362,6 @@ __global__ void CopyRuleBook(const int* counters, } } int inner_index = i - offsets[kernel_index]; - // out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[i] = in_rulebook[kernel_index * non_zero_num + inner_index]; out_rulebook[len + i] = in_rulebook[kernel_size * non_zero_num + kernel_index * non_zero_num + diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index b6f674576f934..1a6842416e3dd 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -24,8 +24,6 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" -#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { @@ -99,7 +97,6 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; - // set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); @@ -109,12 +106,10 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::EmptyLike(dev_ctx, x.non_zero_indices()); DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); T* x_grad_values_ptr = x_grad_values.data(); - // set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, 0, sizeof(T) * x_grad_values.numel(), dev_ctx.stream()); - // set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); phi::backends::gpu::GpuMemsetAsync( d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy(dev_ctx, @@ -160,7 +155,7 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); GroupIndexsV2<< - <<>>(x.non_zero_elements().data(), - out_index_ptr, - unique_value_ptr, - x.nnz(), - kernel_size, - in_channels, - 2, - in_features_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x.nnz() * in_channels, 1); - GatherKernelV2 - <<>>(x.non_zero_elements().data(), - out_index_ptr, - unique_value_ptr, - x.nnz(), - kernel_size, - in_channels, - 2, - in_features_ptr); - } + GatherV2(dev_ctx, + x.non_zero_elements().data(), + out_index_ptr, + unique_value_ptr, + x.nnz(), + kernel_size, + in_channels, + 2, + in_features_ptr); - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels / VecSize, 
1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - out_grad_features_ptr, - rulebook_len, - out_channels); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel - <<>>(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - out_grad_features_ptr, - rulebook_len, - out_channels); - } + Gather(dev_ctx, + out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + out_channels, + out_grad_features_ptr); const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { @@ -276,37 +229,15 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (in_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x_grad->nnz() * in_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(d_x_features_ptr, - out_index.data(), - unique_value.data(), - x_grad->nnz(), - kernel_size, - in_channels, - 2, - x_grad_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, x_grad->nnz() * in_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(d_x_features_ptr, - out_index.data(), - unique_value.data(), - x_grad->nnz(), - kernel_size, - in_channels, - 2, - x_grad_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + 2, + x_grad_values_ptr); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 1381373abc03b..70453d371cc50 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -158,32 +158,12 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - const int VecSize = VecBytes / sizeof(T); - if (in_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, n * in_channels / VecSize, 1); - GatherKernel - <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, - n, - in_channels); - } else { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel - <<>>(x.non_zero_elements().data(), - rulebook_ptr, - in_features_ptr, - n, - in_channels); - } + Gather(dev_ctx, + x.non_zero_elements().data(), + rulebook_ptr, + n, + in_channels, + in_features_ptr); // 3. call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); @@ -233,37 +213,15 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 4. 
scatter - if (out_channels % VecSize == 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels / VecSize, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - 1, - out_values_ptr); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernelV2 - <<>>(out_features_ptr, - out_index.data(), - unique_value.data(), - out->nnz(), - kernel_size, - out_channels, - 1, - out_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + 1, + out_values_ptr); } /** From 842acf70487069a4088defa92bd685db1550e14c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 04:11:17 +0000 Subject: [PATCH 57/70] fix pool --- paddle/phi/api/yaml/sparse_api.yaml | 6 +-- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +-- .../sparse/cpu/sparse_pool_grad_kernel.cc | 15 +++---- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 14 +++++-- .../phi/kernels/sparse/gpu/convolution.cu.h | 36 +++++++---------- .../sparse/gpu/sparse_pool_grad_kernel.cu | 32 +++++---------- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 39 +++++++++++-------- .../kernels/sparse/sparse_pool_grad_kernel.h | 4 +- .../phi/kernels/sparse/sparse_pool_kernel.h | 17 ++++++-- .../tests/kernels/test_sparse_pool_dev_api.cc | 27 ++++++++----- 10 files changed, 104 insertions(+), 92 deletions(-) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index cc583fce1f8a1..a73529dde3c17 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -169,11 +169,11 @@ - api: maxpool args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) - output : Tensor(out), Tensor(rulebook) + output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_maxpool{sparse_coo -> sparse_coo, dense} + func : sparse_maxpool{sparse_coo -> sparse_coo, dense, dense} layout : x - intermediate : rulebook + intermediate : rulebook, counter backward : sparse_maxpool_grad - api: mv diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 04cdfccce636e..4d0371257d810 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -93,11 +93,11 @@ func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr} - backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter) + args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes) output : Tensor(x_grad) kernel : - func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo} + func : sparse_maxpool_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo} - backward_api : sqrt_grad forward : sqrt(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 64c843c07a6ef..580cfe9bb94d0 100644 --- 
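With this change the forward op returns the per-kernel-offset `counter` as an extra (intermediate) output and the backward consumes it, so the gradient kernels no longer rebuild that histogram from the rulebook. A minimal sketch of how the updated C++ dev API is exercised, mirroring the unit test touched later in this series (header paths are the pre-rename ones; error handling omitted):

```cpp
// Sketch only: assumes the MaxPool/MaxPoolGrad convenience wrappers declared
// in the sparse pool headers (renamed to pool_kernel.h / pool_grad_kernel.h
// two patches later in this series).
#include <vector>

#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"

template <typename Context>
phi::SparseCooTensor MaxPoolForwardBackward(
    const Context& dev_ctx,
    const phi::SparseCooTensor& x,
    const phi::SparseCooTensor& out_grad,
    const std::vector<int>& kernel_sizes,
    const std::vector<int>& paddings,
    const std::vector<int>& dilations,
    const std::vector<int>& strides) {
  phi::DenseTensor rulebook, counter;  // both filled by the forward pass
  phi::SparseCooTensor out = phi::sparse::MaxPool<float>(
      dev_ctx, x, kernel_sizes, paddings, dilations, strides,
      &rulebook, &counter);
  // The backward now takes `counter` alongside `rulebook`.
  return phi::sparse::MaxPoolGrad<float>(
      dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes);
}
```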
a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -28,6 +28,7 @@ template void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -36,11 +37,10 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, const int channels = x.dims()[4]; int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0); - for (int i = 0; i < rulebook_len; i++) { - counter[rulebook_ptr[i]] += 1; - } - phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); const T* in_features_ptr = x.non_zero_elements().data(); const T* out_features_ptr = out.non_zero_elements().data(); @@ -60,7 +60,7 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, phi::funcs::MaxPoolGrad grad_functor; for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter[i]; j++) { + for (int j = 0; j < counter_ptr[i]; j++) { IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; for (int c = 0; c < channels; c++) { @@ -78,6 +78,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -85,7 +86,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { MaxPoolGradCPUKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); })); } diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index f4d6e807538ea..a3224b6fe14bb 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -37,7 +37,8 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { const auto& x_dims = x.dims(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; const std::vector& real_kernel_sizes = @@ -71,7 +72,10 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, int rulebook_len = rulebook->dims()[1]; const IntT* rulebook_ptr = rulebook->data(); - const int* counter_ptr = counter_per_kernel.data(); + + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); std::vector offsets(kernel_size + 1); phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); @@ -107,7 +111,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { MaxPoolCPUKernel(dev_ctx, @@ -117,7 +122,8 @@ void MaxPoolKernel(const Context& dev_ctx, dilations, 
strides, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 47409d3a0664b..9787393d06960 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -305,7 +305,7 @@ template __global__ void GetOutIndexTable(const IntT* indices, const IntT non_zero_num, const Dims4D dims, - IntT* out_index_table) { + int* out_index_table) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT batch = indices[i]; IntT in_z = indices[i + non_zero_num]; @@ -378,7 +378,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, - const T* out_index_table, + const int* out_index_table, T* rulebook, int* counter) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -545,11 +545,19 @@ int ProductRuleBook(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + const int rulebook_rows = 2; + const int rulebook_cols = kernel_size * non_zero_num; + DenseTensorMeta rulebook_meta( + indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); + + int64_t table_size = 1; + for (int i = 0; i < out_dims.size() - 1; i++) { + table_size *= out_dims[i]; + } + DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); + int* out_index_table_ptr = out_index_table.data(); + if (subm) { - const int rulebook_rows = 2; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = @@ -562,12 +570,6 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); - int64_t table_size = 1; - for (int i = 0; i < out_dims.size() - 1; i++) { - table_size *= out_dims[i]; - } - DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); - IntT* out_index_table_ptr = out_index_table.data(); phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); @@ -648,10 +650,6 @@ int ProductRuleBook(const Context& dev_ctx, return rulebook_len; } else { - const int rulebook_rows = 2; - const int rulebook_cols = kernel_size * non_zero_num; - DenseTensorMeta rulebook_meta( - indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = rulebook->data(); ProductRuleBookKernel<<data(); int* unique_key_ptr = unique_key.data(); - int64_t table_size = 1; - for (int i = 0; i < out_dims.size() - 1; i++) { - table_size *= out_dims[i]; - } - DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); - int* out_index_table_ptr = out_index_table.data(); phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); phi::backends::gpu::GpuMemsetAsync( diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index 5fe6e68c1e83f..12225da7a01fb 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -55,6 +55,7 @@ template void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const 
DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -63,23 +64,9 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, const int in_channels = x.dims()[4]; int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(rulebook_len, 0); - phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], - rulebook_ptr, - rulebook_len * sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - - dev_ctx.stream()); - dev_ctx.Wait(); - for (int i = 0; i < rulebook_len; i++) { - counter[h_counter[i]] += 1; - } - phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); const T* in_features_ptr = x.non_zero_elements().data(); const T* out_features_ptr = out.non_zero_elements().data(); @@ -99,12 +86,12 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, &x_grad_indices); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter_ptr[i] <= 0) { continue; } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter[i] * in_channels, 1); + dev_ctx, counter_ptr[i] * in_channels, 1); MaxPoolGradCudaKernel <<>>(in_features_ptr, out_features_ptr, out_grad_ptr, - rulebook_ptr + offsets[i] + rulebook_len, - counter[i], + rulebook_ptr + offsets[i], + counter_ptr[i], rulebook_len, in_channels, x_grad_ptr); @@ -124,6 +111,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -131,7 +119,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { MaxPoolGradGPUKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); })); } diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu index bc6723d26b7a6..61a622075efde 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -55,7 +55,8 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { const auto& x_dims = x.dims(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; const std::vector& real_kernel_sizes = @@ -65,7 +66,7 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - std::vector offsets(kernel_size + 1), counter(kernel_size); + std::vector offsets(kernel_size + 1), h_counter(kernel_size); DenseTensorMeta counter_meta( DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); @@ -89,13 +90,16 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &counter, + &h_counter, &offsets); const IntT* rulebook_ptr = rulebook->data(); T* out_features_ptr = out->mutable_non_zero_elements()->data(); const T* 
in_features_ptr = x.non_zero_elements().data(); + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); // 2. max pool #ifdef PADDLE_WITH_HIP thrust::fill(thrust::hip::par.on(dev_ctx.stream()), @@ -107,22 +111,21 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, static_cast(0)); // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (h_counter[i] <= 0) { continue; } auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter[i] * in_channels, 1); - MaxPoolCudaKernel - <<>>(in_features_ptr, - rulebook_ptr + offsets[i] + rulebook_len, - counter[i], - rulebook_len, - in_channels, - out_features_ptr); + dev_ctx, h_counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>(in_features_ptr, + rulebook_ptr + offsets[i], + h_counter[i], + rulebook_len, + in_channels, + out_features_ptr); } } @@ -134,7 +137,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { MaxPoolGPUKernel(dev_ctx, @@ -144,7 +148,8 @@ void MaxPoolKernel(const Context& dev_ctx, dilations, strides, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h index 2f7366a010aaa..ef9f8418b0116 100644 --- a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -25,6 +25,7 @@ template void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, @@ -34,12 +35,13 @@ template SparseCooTensor MaxPoolGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out, const SparseCooTensor& out_grad, const std::vector& kernel_sizes) { SparseCooTensor x_grad; MaxPoolGradKernel( - dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); return x_grad; } diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h index d5248a1ad250e..9f4939da8d52a 100644 --- a/paddle/phi/kernels/sparse/sparse_pool_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -29,7 +29,8 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, SparseCooTensor* out, - DenseTensor* rulebook); + DenseTensor* rulebook, + DenseTensor* counter); template SparseCooTensor MaxPool(const Context& dev_ctx, @@ -38,10 +39,18 @@ SparseCooTensor MaxPool(const Context& dev_ctx, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { SparseCooTensor coo; - MaxPoolKernel( - dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); + MaxPoolKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + &coo, + rulebook, + counter); return coo; } diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc 
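Both pool kernels turn the per-kernel `counter` into segment offsets before launching one small kernel per non-empty segment, so segment i of the rulebook starts at `offsets[i]` and holds `counter[i]` entries. A standalone sketch of that bookkeeping, assuming `phi::funcs::sparse::PrefixSum` is an exclusive scan:

```cpp
// Standalone illustration; assumption: PrefixSum performs an exclusive scan,
// so offsets has kernel_size + 1 entries and offsets[kernel_size] equals the
// total rulebook length.
#include <cstdio>
#include <vector>

std::vector<int> ExclusivePrefixSum(const std::vector<int>& counter) {
  std::vector<int> offsets(counter.size() + 1, 0);
  for (size_t i = 0; i < counter.size(); ++i) {
    offsets[i + 1] = offsets[i] + counter[i];
  }
  return offsets;
}

int main() {
  const std::vector<int> counter = {3, 0, 5};  // hits per kernel offset
  const std::vector<int> offsets = ExclusivePrefixSum(counter);
  // Segment i is rulebook_ptr + offsets[i] with counter[i] entries; empty
  // segments (counter[i] == 0) are skipped before the launch.
  for (size_t i = 0; i < counter.size(); ++i) {
    std::printf("segment %zu: [%d, %d)\n", i, offsets[i],
                offsets[i] + counter[i]);
  }
  return 0;
}
```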
b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 460dca59c718c..36fa99d9bfc75 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" @@ -89,14 +90,15 @@ void TestMaxPoolBase(const std::vector& indices, }; if (!std::is_same::value) { - DenseTensor rulebook; + DenseTensor rulebook, counter; SparseCooTensor out = sparse::MaxPool(dev_ctx_cpu, x_tensor, kernel_sizes, paddings, dilations, strides, - &rulebook); + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -113,7 +115,7 @@ void TestMaxPoolBase(const std::vector& indices, if (backward) { SparseCooTensor x_grad = sparse::MaxPoolGrad( - dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes); + dev_ctx_cpu, x_tensor, rulebook, counter, out, out, kernel_sizes); f_verify(x_grad.non_zero_elements().data(), features_grad); } } @@ -149,14 +151,16 @@ void TestMaxPoolBase(const std::vector& indices, SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); - DenseTensor d_rulebook; + DenseTensor d_rulebook, d_counter; SparseCooTensor d_out = sparse::MaxPool(dev_ctx_gpu, d_x_tensor, kernel_sizes, paddings, dilations, strides, - &d_rulebook); + &d_rulebook, + &d_counter); + SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -168,7 +172,7 @@ void TestMaxPoolBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -182,15 +186,20 @@ void TestMaxPoolBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - SparseCooTensor x_grad = sparse::MaxPoolGrad( - dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes); + SparseCooTensor x_grad = sparse::MaxPoolGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_counter, + d_out, + d_out, + kernel_sizes); DenseTensor h_features_grad = phi::EmptyLike(dev_ctx_cpu, x_grad.non_zero_elements()); phi::Copy(dev_ctx_gpu, From 5751987c6407cf3492b047e012515dd686fdfc07 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 05:40:29 +0000 Subject: [PATCH 58/70] rename pool_kernel.cc --- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 183 ---------- .../sparse/cpu/sparse_pool_grad_kernel.cc | 103 ------ .../kernels/sparse/cpu/sparse_pool_kernel.cc | 140 -------- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 340 ------------------ .../sparse/gpu/sparse_pool_grad_kernel.cu | 136 ------- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 167 --------- .../phi/kernels/sparse/sparse_mask_kernel.h | 36 -- .../kernels/sparse/sparse_pool_grad_kernel.h | 49 --- .../phi/kernels/sparse/sparse_pool_kernel.h | 58 --- 
.../sparse/sparse_utils_grad_kernel.cc | 1 - .../kernels/sparse/sparse_utils_grad_kernel.h | 2 +- .../tests/kernels/test_sparse_pool_dev_api.cc | 4 +- 12 files changed, 3 insertions(+), 1216 deletions(-) delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu delete mode 100644 paddle/phi/kernels/sparse/sparse_mask_kernel.h delete mode 100644 paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h delete mode 100644 paddle/phi/kernels/sparse/sparse_pool_kernel.h diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc deleted file mode 100644 index cf2acd8557333..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" - -namespace phi { -namespace sparse { - -template -void SparseMaskCPUKernel(const CPUContext& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - const DDim& dims = x.dims(); - PADDLE_ENFORCE_EQ( - x.dims(), - mask.dims(), - phi::errors::InvalidArgument("the input x and mask must have the shape")); - const DenseTensor& indices = mask.non_zero_indices(); - const DenseTensor& values = mask.non_zero_elements(); - const int sparse_dim = mask.sparse_dim(); - - DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); - DenseTensor out_values = phi::EmptyLike(dev_ctx, values); - - // the out_indices is same as indices of mask - phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - - T* out_values_ptr = out_values.data(); - const T* x_ptr = x.data(); - - const int64_t non_zero_num = mask.nnz(); - auto dims_2d = flatten_to_2d(dims, sparse_dim); - const int cols = dims_2d[1]; - const IntT* indices_ptr = indices.data(); - - std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); - - phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, sparse_offsets.data()); - - for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = phi::funcs::sparse::CoordinateToIndex( - indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); - memcpy(out_values_ptr + i * cols, x_ptr + index * 
cols, cols * sizeof(T)); - } - - out->SetMember(out_indices, out_values, dims, true); -} - -/** - * @brief Filter the DenseTensor x by the - * mask.non_zero_indices() and output a SparseCooTensor - * x and mask must have the same shape. - **/ -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { - SparseMaskCPUKernel(dev_ctx, x, mask, out); - })); -} - -template -void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - mask_indices.dims().size(), - 2, - phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - - const int32_t sparse_dim = x.sparse_dim(); - - std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), - mask_indexs(mask_indices.dims()[1]); - phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, sparse_offsets.data()); - - phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - 0, - 1, - x_indexs.data()); - phi::funcs::sparse::FlattenIndices(mask_indices.data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - 0, - 1, - mask_indexs.data()); - - std::unordered_map x_indexs_map; - for (uint64_t i = 0; i < x_indexs.size(); i++) { - x_indexs_map[x_indexs[i]] = i; - } - *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - T* out_ptr = out->data(); - memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - const T* in_ptr = x.non_zero_elements().data(); - // TODO(zhangkaihuo): multithreading can be used for acceleration - for (uint64_t i = 0; i < mask_indexs.size(); i++) { - auto iter = x_indexs_map.find(mask_indexs[i]); - if (iter != x_indexs_map.end()) { - memcpy(out_ptr + i * stride, - in_ptr + iter->second * stride, - stride * sizeof(T)); - } - } -} - -/** - * @brief filter values from x.values() using mask_indices - */ -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { - SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_mask, - CPU, - ALL_LAYOUT, - phi::sparse::SparseMaskKernel, - float, - double, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_mask_helper, - CPU, - ALL_LAYOUT, - phi::sparse::SparseMaskHelperKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc deleted file mode 100644 index 580cfe9bb94d0..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
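The mask kernels being moved here all rely on the same trick: fold an n-dimensional sparse coordinate into a single integer key using row-major strides over the sparse dimensions, so membership checks become plain integer lookups. A hypothetical, self-contained sketch of that flattening (the real helpers are `phi::funcs::sparse::CalcOffsetsPerDim`, `FlattenIndices`, and `CoordinateToIndex`):

```cpp
// Hypothetical sketch: indices are stored column-wise, one row per sparse
// dimension, and each coordinate is folded into one integer with row-major
// strides over the sparse dimensions.
#include <cstdint>
#include <vector>

std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& sparse_dims) {
  std::vector<int64_t> strides(sparse_dims.size(), 1);
  for (int j = static_cast<int>(sparse_dims.size()) - 2; j >= 0; --j) {
    strides[j] = strides[j + 1] * sparse_dims[j + 1];
  }
  return strides;
}

int64_t FlattenIndex(const std::vector<int64_t>& indices,  // sparse_dim x nnz
                     const std::vector<int64_t>& strides,
                     int64_t nnz,
                     int64_t i) {
  int64_t key = 0;
  for (size_t j = 0; j < strides.size(); ++j) {
    key += indices[j * nnz + i] * strides[j];  // dim j of the i-th non-zero
  }
  return key;
}
```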
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int channels = x.dims()[4]; - int rulebook_len = rulebook.dims()[1]; - const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1); - const int* counter_ptr = counter.data(); - - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - - const T* in_features_ptr = x.non_zero_elements().data(); - const T* out_features_ptr = out.non_zero_elements().data(); - const T* out_grad_ptr = out_grad.non_zero_elements().data(); - // TODO(zhangkaihuo): call phi::sparse::EmptyLike - DenseTensor x_grad_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - T* x_grad_ptr = x_grad_values.data(); - memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &x_grad_indices); - - phi::funcs::MaxPoolGrad grad_functor; - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter_ptr[i]; j++) { - IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; - IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; - for (int c = 0; c < channels; c++) { - grad_functor.compute(in_features_ptr[in_i * channels + c], - out_features_ptr[out_i * channels + c], - out_grad_ptr[out_i * channels + c], - 1, - &x_grad_ptr[in_i * channels + c]); - } - } - } -} - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { - MaxPoolGradCPUKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool_grad, - CPU, - ALL_LAYOUT, - phi::sparse::MaxPoolGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc deleted file mode 100644 index a3224b6fe14bb..0000000000000 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ /dev/null @@ -1,140 +0,0 @@ 
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" - -namespace phi { -namespace sparse { - -/** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) - **/ -template -void MaxPoolCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - const auto& x_dims = x.dims(); - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const std::vector& real_kernel_sizes = - phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); - DDim out_dims = {1, 1, 1, 1, 1}; - phi::funcs::sparse::GetOutShape( - x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); - const int in_channels = real_kernel_sizes[3]; - - // DenseTensorMeta counter_meta( - // DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, - // std::move(counter_meta)); - std::vector counter_per_kernel(kernel_size, 0); - - const T* in_features_ptr = x.non_zero_elements().data(); - // 1. product rule book - ProductRuleBook(dev_ctx, - x, - real_kernel_sizes, - paddings, - dilations, - strides, - out_dims, - false, - rulebook, - &counter_per_kernel); - - UpdateRulebookAndOutIndex( - dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); - - int rulebook_len = rulebook->dims()[1]; - const IntT* rulebook_ptr = rulebook->data(); - - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); - - std::vector offsets(kernel_size + 1); - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - std::vector out_flags(out->nnz(), false); - - // 2. 
max pool - T* out_features_ptr = out->mutable_non_zero_elements()->data(); - phi::funcs::MaxPool max_pool_functor; - for (int i = 0; i < kernel_size; i++) { - for (int j = 0; j < counter_ptr[i]; j++) { - IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; - IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; - if (!out_flags[out_i]) { - out_flags[out_i] = true; - memcpy(&out_features_ptr[out_i * in_channels], - &in_features_ptr[in_i * in_channels], - in_channels * sizeof(T)); - } else { - for (int c = 0; c < in_channels; c++) { - max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], - &out_features_ptr[out_i * in_channels + c]); - } - } - } - } -} - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { - MaxPoolCPUKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - out, - rulebook, - counter); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool, - CPU, - ALL_LAYOUT, - phi::sparse::MaxPoolKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu deleted file mode 100644 index 0e399a7b0e81f..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
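The deleted CPU forward (re-added as pool_kernel.cc two patches later) shows the core of the sparse max pool: each rulebook entry maps one input non-zero to one output non-zero, the first hit initializes the output row, and later hits take a channel-wise maximum. A compact sketch of that inner loop:

```cpp
// Illustration of the rulebook-driven max pool; in the real kernel the input
// and output row indices come from the second and third rulebook rows, and
// phi::funcs::MaxPool keeps the running maximum per channel.
#include <algorithm>
#include <vector>

void MaxPoolByRulebook(const std::vector<float>& in,   // nnz_in x channels
                       const std::vector<int>& in_idx,
                       const std::vector<int>& out_idx,
                       int channels,
                       std::vector<float>* out) {      // nnz_out x channels
  std::vector<bool> initialized(out->size() / channels, false);
  for (size_t r = 0; r < in_idx.size(); ++r) {
    const float* src = in.data() + in_idx[r] * channels;
    float* dst = out->data() + out_idx[r] * channels;
    if (!initialized[out_idx[r]]) {
      initialized[out_idx[r]] = true;
      std::copy(src, src + channels, dst);            // first contribution
    } else {
      for (int c = 0; c < channels; ++c) {
        dst[c] = std::max(dst[c], src[c]);            // later contributions
      }
    }
  }
}
```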
*/ - -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include - -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" -#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaskKernel(const T* x_ptr, - const IntT* indices_ptr, - const int64_t* sparse_offsets, - const int64_t non_zero_num, - const int cols, - const int sparse_dim, - T* out_values_ptr) { - CUDA_KERNEL_LOOP_TYPE(i, non_zero_num * cols, int64_t) { - int64_t out_i = i / cols; - int64_t col_i = i - out_i * cols; - int64_t index = 0; - for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; - } - out_values_ptr[out_i * cols + col_i] = x_ptr[index * cols + col_i]; - } -} - -template -void SparseMaskGPUKernel(const GPUContext& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - const DDim& dims = x.dims(); - PADDLE_ENFORCE_EQ( - x.dims(), - mask.dims(), - phi::errors::InvalidArgument("the input x and mask must have the shape")); - const DenseTensor& indices = mask.non_zero_indices(); - const DenseTensor& values = mask.non_zero_elements(); - const int sparse_dim = mask.sparse_dim(); - DenseTensor sparse_offsets = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); - std::vector h_sparse_offsets(sparse_dim); - phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, h_sparse_offsets.data()); - - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif - dev_ctx.stream()); - - DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); - DenseTensor out_values = phi::EmptyLike(dev_ctx, values); - - phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - - const IntT* indices_ptr = indices.data(); - T* out_values_ptr = out_values.data(); - const T* x_ptr = x.data(); - const int64_t non_zero_num = mask.nnz(); - auto dims_2d = flatten_to_2d(dims, sparse_dim); - const int cols = dims_2d[1]; - - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel - <<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); - - out->SetMember(out_indices, out_values, dims, true); -} - -/** - * @brief Filter the DenseTensor x by the - * mask.non_zero_indices() and output a SparseCooTensor - * x and mask must have the same shape. 
- **/ -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { - SparseMaskGPUKernel(dev_ctx, x, mask, out); - })); -} - -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - const int64_t stride, - T* out_values) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride / VecSize; k++) { - // out_values[i * stride + k] = x_values[j * stride + k]; - LoadT vec_x; - phi::Load(x_values + j * stride + k * VecSize, &vec_x); - phi::Store(vec_x, out_values + i * stride + k * VecSize); - } - } - } -} - -template -__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - int index = x_indexs[i]; - table[index] = i == 0 ? -1 : i; - } -} - -template -__global__ void MaskCopy(const IntT* mask_indexs, - const int* table, - const int n, - const int stride, - const T* x_values, - T* out_values) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - int j = table[mask_indexs[i]]; - if (j != 0) { - if (j == -1) j = 0; - for (int k = 0; k < stride; k += VecSize) { - LoadT vec_x; - phi::Load(x_values + j * stride + k, &vec_x); - phi::Store(vec_x, out_values + i * stride + k); - } - } - } -} - -template -void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - mask_indices.dims().size(), - 2, - phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - - const int32_t sparse_dim = x.sparse_dim(); - auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); - - std::vector sparse_offsets(sparse_dim); - - DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); - DenseTensorMeta mask_indexs_meta( - indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); - DenseTensorMeta sparse_offset_meta( - indices_dtype, {sparse_dim}, DataLayout::NCHW); - - DenseTensor x_indexs = - phi::Empty(dev_ctx, std::move(x_indexs_meta)); - DenseTensor mask_indexs = - phi::Empty(dev_ctx, std::move(mask_indexs_meta)); - DenseTensor bound_out = - phi::Empty(dev_ctx, std::move(mask_indexs_meta)); - DenseTensor d_sparse_offsets = - phi::Empty(dev_ctx, std::move(sparse_offset_meta)); - IntT* x_indexs_ptr = x_indexs.data(); - IntT* mask_indexs_ptr = mask_indexs.data(); - IntT* bound_out_ptr = bound_out.data(); - - // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, sparse_offsets.data()); - // 2. copy sparse_offsets to device - phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif - dev_ctx.stream()); - - // 3. 
flatten x indices and mask indices - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - phi::funcs::sparse::FlattenIndicesKernel<<>>( - x.non_zero_indices().data(), - d_sparse_offsets.data(), - x_indexs.numel(), - sparse_dim, - x_indexs_ptr); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - phi::funcs::sparse::FlattenIndicesKernel<<>>( - mask_indices.data(), - d_sparse_offsets.data(), - mask_indexs.numel(), - sparse_dim, - mask_indexs_ptr); - - int table_size = 1; - auto x_dims = x.dims(); - for (int i = 0; i < x_dims.size() - 1; i++) { - table_size *= x_dims[i]; - } - DenseTensor table = phi::Empty(dev_ctx, {table_size}); - cudaMemsetAsync( - table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, out, static_cast(0)); - T* out_ptr = out->data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - MaskTable<<>>( - x_indexs_ptr, x_indexs.numel(), table.data()); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - const int VecSize = VecBytes / sizeof(T); - if (stride % VecSize == 0) { - MaskCopy - <<>>(mask_indexs_ptr, - table.data(), - mask_indexs.numel(), - stride, - x.non_zero_elements().data(), - out_ptr); - } else { - MaskCopy<<>>(mask_indexs_ptr, - table.data(), - mask_indexs.numel(), - stride, - x.non_zero_elements().data(), - out_ptr); - } -} - -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { - SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_mask, - GPU, - ALL_LAYOUT, - phi::sparse::SparseMaskKernel, - float, - double, - phi::dtype::float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_mask_helper, - GPU, - ALL_LAYOUT, - phi::sparse::SparseMaskHelperKernel, - float, - double, - phi::dtype::float16, - uint8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu deleted file mode 100644 index 12225da7a01fb..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
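The GPU mask helper shown here builds a dense lookup table over flattened indices, and because the table is zero-initialized it needs a sentinel: position 0 of x is stored as -1, while a stored 0 means "this flattened index is not present". A small CPU sketch of that encoding:

```cpp
// CPU illustration of the MaskTable/MaskCopy sentinel encoding: 0 in the table
// means "absent", -1 encodes position 0, any other value is the position
// itself.
#include <vector>

std::vector<int> BuildTable(const std::vector<int>& x_indexs, int table_size) {
  std::vector<int> table(table_size, 0);            // zero-initialized
  for (int i = 0; i < static_cast<int>(x_indexs.size()); ++i) {
    table[x_indexs[i]] = (i == 0) ? -1 : i;         // -1 stands in for 0
  }
  return table;
}

// Returns the position of `mask_index` in x_indexs, or -1 if it is absent.
int Lookup(const std::vector<int>& table, int mask_index) {
  const int j = table[mask_index];
  if (j == 0) return -1;                            // not present in x
  return (j == -1) ? 0 : j;                         // decode the sentinel
}
```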
*/ - -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, - const T* out_features_ptr, - const T* out_grad_ptr, - const IntT* rulebook_ptr, - const int n, - const int rulebook_len, - const int channels, - T* x_grad_ptr) { - phi::funcs::MaxPoolGrad grad_functor; - CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { - int real_i = i / channels; - int c = i - real_i * channels; - IntT in_i = rulebook_ptr[real_i]; - IntT out_i = rulebook_ptr[real_i + rulebook_len]; - grad_functor.compute(in_features_ptr[in_i * channels + c], - out_features_ptr[out_i * channels + c], - out_grad_ptr[out_i * channels + c], - 1, - &x_grad_ptr[in_i * channels + c]); - } -} - -template -void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const int in_channels = x.dims()[4]; - int rulebook_len = rulebook.dims()[1]; - const IntT* rulebook_ptr = rulebook.data(); - std::vector offsets(kernel_size + 1); - const int* counter_ptr = counter.data(); - phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); - - const T* in_features_ptr = x.non_zero_elements().data(); - const T* out_features_ptr = out.non_zero_elements().data(); - const T* out_grad_ptr = out_grad.non_zero_elements().data(); - // TODO(zhangkaihuo): call phi::sparse::EmptyLike - DenseTensor x_grad_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); - x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); - T* x_grad_ptr = x_grad_values.data(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &x_grad_indices); - - for (int i = 0; i < kernel_size; i++) { - if (counter_ptr[i] <= 0) { - continue; - } - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, counter_ptr[i] * in_channels, 1); - MaxPoolGradCudaKernel - <<>>(in_features_ptr, - out_features_ptr, - out_grad_ptr, - rulebook_ptr + offsets[i], - counter_ptr[i], - rulebook_len, - in_channels, - x_grad_ptr); - } -} - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { - MaxPoolGradGPUKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool_grad, - GPU, - 
ALL_LAYOUT, - phi::sparse::MaxPoolGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu deleted file mode 100644 index 61a622075efde..0000000000000 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/funcs/pooling.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" - -namespace phi { -namespace sparse { - -template -__global__ void MaxPoolCudaKernel(const T* in_features_ptr, - const IntT* rulebook_ptr, - const int n, - const int rulebook_len, - const int channels, - T* out_features_ptr) { - phi::funcs::MaxPool max_pool_functor; - CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { - int real_i = i / channels; - int channel_i = i - real_i * channels; - IntT in_i = rulebook_ptr[real_i]; - IntT out_i = rulebook_ptr[real_i + rulebook_len]; - max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], - &out_features_ptr[out_i * channels + channel_i]); - } -} - -/** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) - **/ -template -void MaxPoolGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - const auto& x_dims = x.dims(); - int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - const std::vector& real_kernel_sizes = - phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); - DDim out_dims = {1, 1, 1, 1, 1}; - phi::funcs::sparse::GetOutShape( - x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); - const int in_channels = real_kernel_sizes[3]; - - std::vector offsets(kernel_size + 1), h_counter(kernel_size); - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); - DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); - DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - - // 1. 
product rulebook - int rulebook_len = ProductRuleBook(dev_ctx, - x, - real_kernel_sizes, - paddings, - dilations, - strides, - out_dims, - false, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); - - const IntT* rulebook_ptr = rulebook->data(); - - T* out_features_ptr = out->mutable_non_zero_elements()->data(); - const T* in_features_ptr = x.non_zero_elements().data(); - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); -// 2. max pool -#ifdef PADDLE_WITH_HIP - thrust::fill(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), -#endif - out_features_ptr, - out_features_ptr + out->non_zero_elements().numel(), - static_cast(0)); - // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster - for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { - continue; - } - - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, h_counter[i] * in_channels, 1); - MaxPoolCudaKernel<<>>(in_features_ptr, - rulebook_ptr + offsets[i], - h_counter[i], - rulebook_len, - in_channels, - out_features_ptr); - } -} - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { - MaxPoolGPUKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - out, - rulebook, - counter); - })); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_maxpool, - GPU, - ALL_LAYOUT, - phi::sparse::MaxPoolKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/sparse_mask_kernel.h deleted file mode 100644 index 88899e3dc672e..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_mask_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" - -namespace phi { -namespace sparse { - -template -void SparseMaskKernel(const Context& dev_ctx, - const DenseTensor& x, - const SparseCooTensor& mask, - SparseCooTensor* out); - -template -void SparseMaskHelperKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& mask_indices, - DenseTensor* out); - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h deleted file mode 100644 index ef9f8418b0116..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes, - SparseCooTensor* x_grad); - -template -SparseCooTensor MaxPoolGrad(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out, - const SparseCooTensor& out_grad, - const std::vector& kernel_sizes) { - SparseCooTensor x_grad; - MaxPoolGradKernel( - dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); - return x_grad; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h deleted file mode 100644 index 9f4939da8d52a..0000000000000 --- a/paddle/phi/kernels/sparse/sparse_pool_kernel.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void MaxPoolKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter); - -template -SparseCooTensor MaxPool(const Context& dev_ctx, - const SparseCooTensor& x, - const std::vector& kernel_sizes, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - DenseTensor* rulebook, - DenseTensor* counter) { - SparseCooTensor coo; - MaxPoolKernel(dev_ctx, - x, - kernel_sizes, - paddings, - dilations, - strides, - &coo, - rulebook, - counter); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 69677be34b231..9425c14b79b36 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index a00b9c275c292..7cf97c3f48ece 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 36fa99d9bfc75..7497dca51a59c 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -23,8 +23,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/sparse/coalesced_kernel.h" -#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" -#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" +#include "paddle/phi/kernels/sparse/pool_kernel.h" namespace phi { namespace tests { From 7c2fbf52f71c8835632928b7da5db7db9108e954 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 05:42:40 +0000 Subject: [PATCH 59/70] add new file --- .../final_state_generator/eager_gen.py | 3 - paddle/phi/kernels/sparse/cpu/mask_kernel.cc | 183 ++++++++++ .../kernels/sparse/cpu/pool_grad_kernel.cc | 103 ++++++ paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 140 ++++++++ .../gpu/.convolution_grad_kernel.cu.swp | Bin 0 -> 20480 bytes paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 338 ++++++++++++++++++ .../kernels/sparse/gpu/pool_grad_kernel.cu | 136 +++++++ paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 167 +++++++++ paddle/phi/kernels/sparse/mask_kernel.h | 36 ++ paddle/phi/kernels/sparse/pool_grad_kernel.h | 49 +++ paddle/phi/kernels/sparse/pool_kernel.h | 58 +++ 11 files changed, 1210 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/kernels/sparse/cpu/mask_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/pool_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp create mode 100644 paddle/phi/kernels/sparse/gpu/mask_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/pool_kernel.cu create mode 100644 paddle/phi/kernels/sparse/mask_kernel.h create mode 100644 paddle/phi/kernels/sparse/pool_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/pool_kernel.h diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index a595bf5c613c6..d406f00b25039 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -545,9 +545,6 @@ def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map backward_grad_inputs_map = self.backward_grad_inputs_map backward_attrs_list = self.backward_attrs_list - print(backward_forward_inputs_map) - print(backward_grad_inputs_map) - print(backward_attrs_list) # Check Order: TensorWrappers, GradTensors, Attributes max_fwd_input_position = -1 diff --git a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc new file mode 100644 index 0000000000000..92c015101264c --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/mask_kernel.h" + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace sparse { + +template +void SparseMaskCPUKernel(const CPUContext& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + const DDim& dims = x.dims(); + PADDLE_ENFORCE_EQ( + x.dims(), + mask.dims(), + phi::errors::InvalidArgument("the input x and mask must have the shape")); + const DenseTensor& indices = mask.non_zero_indices(); + const DenseTensor& values = mask.non_zero_elements(); + const int sparse_dim = mask.sparse_dim(); + + DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, values); + + // the out_indices is same as indices of mask + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); + + T* out_values_ptr = out_values.data(); + const T* x_ptr = x.data(); + + const int64_t non_zero_num = mask.nnz(); + auto dims_2d = flatten_to_2d(dims, sparse_dim); + const int cols = dims_2d[1]; + const IntT* indices_ptr = indices.data(); + + std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, sparse_offsets.data()); + + for (int64_t i = 0; i < non_zero_num; i++) { + int64_t index = phi::funcs::sparse::CoordinateToIndex( + indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); + memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); + } + + out->SetMember(out_indices, out_values, dims, true); +} + +/** + * @brief Filter the DenseTensor x by the + * mask.non_zero_indices() and output a SparseCooTensor + * x and mask must have the same shape. + **/ +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { + SparseMaskCPUKernel(dev_ctx, x, mask, out); + })); +} + +template +void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int32_t sparse_dim = x.sparse_dim(); + + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), + mask_indexs(mask_indices.dims()[1]); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + phi::funcs::sparse::FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + mask_indexs.data()); + + std::unordered_map x_indexs_map; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + x_indexs_map[x_indexs[i]] = i; + } + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + T* out_ptr = out->data(); + memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; + const T* in_ptr = x.non_zero_elements().data(); + // TODO(zhangkaihuo): multithreading can be used for acceleration + for (uint64_t i = 0; i < mask_indexs.size(); i++) { + auto iter = x_indexs_map.find(mask_indexs[i]); + if (iter != x_indexs_map.end()) { + memcpy(out_ptr + i * stride, + in_ptr + iter->second * stride, + stride * sizeof(T)); + } + } +} + +/** + * @brief filter values from x.values() using mask_indices + */ +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { + SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_mask, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskKernel, + float, + double, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_mask_helper, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc new file mode 100644 index 0000000000000..d17d06e6e4f14 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/pool_grad_kernel.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const IntT* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.non_zero_elements().data(); + // TODO(zhangkaihuo): call phi::sparse::EmptyLike + DenseTensor x_grad_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); + T* x_grad_ptr = x_grad_values.data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &x_grad_indices); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { + MaxPoolGradCPUKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc new file mode 100644 index 0000000000000..38e512bd00c93 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/pool_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) + **/ +template +void MaxPoolCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + // DenseTensorMeta counter_meta( + // DataType::INT32, {kernel_size}, DataLayout::NCHW); + // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, + // std::move(counter_meta)); + std::vector counter_per_kernel(kernel_size, 0); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const IntT* rulebook_ptr = rulebook->data(); + + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_per_kernel.data(), kernel_size * sizeof(int)); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. 
max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { + MaxPoolCPUKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + out, + rulebook, + counter); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp b/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp new file mode 100644 index 0000000000000000000000000000000000000000..e1d0e2bee6c88631e06944143561a3ec7c50b945 GIT binary patch literal 20480 zcmeHNZEPGz86GGlgtQPSsgMw2aFDrk?#p)6L|4afu}yl4V+Y?QQLxh0-tOGp;=SF? z?yT)gNGl;xBt#I3gvzh-RfJR!@PpswLn@+D1!`1*A|WIY^EhE2nGRUt_mP+}noxDCZHJvY&N+$|JgcRFuhZV22b*HtaTkX&-dcON< zX7J*9-85}h^x|9FDy}i#VRldqJlzjivE_wD!*$kOJLHz@Xf0niwMeF5gnJcGy;ls} z>kNdvnLqg;nVy;~2`Y|`eV9J<-V66SE2{pA0mXn~Krx^gPz)#r6a$KZdx`<>-AA58 zjo%vA`f_}K%Z~f&Z$_6Hr+;e4`?tl<$Kw0*JKo2S>ZTY_3@8Q^1BwB~fMP%~pcqgL zC@-yHT@Hp@x;MGIW2VMbw3j754Ch!dK1>gzbD$oJufe!$G zegYi+jkK19PlJC4IBi1hYEfV_%84n;5u*#co_KG z+X?w0@Dwl)90gvxpO6=UF9Q~E4444^g-wo^fLnk8oB$30zq*f*UjokpHZTd?58TG4 z%2$D>flmTO;Gf|67l7dVDZzPk(NQixv{}P-*EE|sbbzjX7peE+fdITt4H4=UB(^x|N{o7@6_5X1JllnZH`Pl4R3M%k(UwNI6={ zJ_@S=H_PR9gmt+73<@_(-DWdjJjg`x8k#6KJw_+9IU1WzzY$h)M3`Y*yT*JsW3qM4 z;F~!*Zfcualj%J4S&&0MI=2kX&;veGpUspCrDirNK%eHO3>eu(r9MF;8Esv+Ll$UY zh@Ohm^0TJSb&-cO9|`7XSD2_t#W5?k?HW2~nazTO*c4G{joGZj98@}6Fs%-5QdJMR zOB=Qxz_uj689D}-Kg0NGq|VH_V0x41DDsM{E%iVnZR>V34ta?vJVn!l7U|TZgK3MU zM99o845W)wV;m&`zYb3qsn;Z;mBCmNffJLJ%4Y{-+urok7QYre$=brS8}cZ|Cu#0D zkG}N6wx*GV+Ug^*Qo7sp$y-jZOU2@;l(aPJEH25@LdWCXq=f{j=N230bMt8bRL6gQSTzPTwa=c?2>Hr6fft%SU)$iIGLEA;d!BC`^_p%(|<3Q6lHNH zs0%{c0#3!~PBVtdA$g1^kJZU5a&Y(LCwdZbr`=KzQU6I6Un~NzlnD{#N06++XcQ`oe!rE3PY#~RQOhyx&}4LMMcfFSg`mGCIad)zheb~w5mHMxm2w`wtS&ZFI84nDoge1 zLX9r3(7ENM`D(qoyo9Gu(8|(fdbYYWpQFrzeV?*T56kj^B0mcQ%wlGNn1WBM5prhs z7nfQAO$)dV)g3xksnKd}jGnI4s)je1@^oGq)mkp7h-trmV zVQp<+vFR%&%~;j(l+N_xmeuT*eK{2E^StD{ZNYTHgzUs|LH zS+vNASKlF=Y>g3>Q`^=9D>&FpvbiQU)b^oNyrk~i@i%ylIJ^Fp< zhmLhMWYGcT2qxnF&H{E|nBK``XJ>Jym^wfe2M0On|NjPOC0T=i<@G;;$!0+MvzX*H@I13yC?%-Ts zocr5A2_V25IM;t2_!Dp&co}#OxCI121^6iN0PqKt_ZxsH_cNmG>QW3S1{4E|0mXn~ zKr!(D!N4GAW}m*ufR(87Ok@zjzC05V0J0xvq|cMt$4Xy0GU-k`nj@3e=>0DZAw52@ z=WVz~riSBht$`5EwE|*ED6Juu6Y;KRWfTdbK4)fTVtFYf#aoEj5z-JBR}x#YEjj}s zAnArgz-t_3g;)VZipW8jL=+^!w~QnHc2PzZz@>-ajn? 
z&_U^V&p90`GLoO%6B6;W$F|WSsdasX0w84eg7jsp!W-^|QHpw|B8U($u;uS2d<_k} zlo2yB)FTSE9AF6bzbi96itfgW3~t;iY)0%!)e(kNE?1Z8(^ETQCx=B#Mw82b6oQOh No8tu +__global__ void MaskKernel(const T* x_ptr, + const IntT* indices_ptr, + const int64_t* sparse_offsets, + const int64_t non_zero_num, + const int cols, + const int sparse_dim, + T* out_values_ptr) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num * cols, int64_t) { + int64_t out_i = i / cols; + int64_t col_i = i - out_i * cols; + int64_t index = 0; + for (int j = 0; j < sparse_dim; j++) { + index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; + } + out_values_ptr[out_i * cols + col_i] = x_ptr[index * cols + col_i]; + } +} + +template +void SparseMaskGPUKernel(const GPUContext& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + const DDim& dims = x.dims(); + PADDLE_ENFORCE_EQ( + x.dims(), + mask.dims(), + phi::errors::InvalidArgument("the input x and mask must have the shape")); + const DenseTensor& indices = mask.non_zero_indices(); + const DenseTensor& values = mask.non_zero_elements(); + const int sparse_dim = mask.sparse_dim(); + DenseTensor sparse_offsets = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); + std::vector h_sparse_offsets(sparse_dim); + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, h_sparse_offsets.data()); + + phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, values); + + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); + + const IntT* indices_ptr = indices.data(); + T* out_values_ptr = out_values.data(); + const T* x_ptr = x.data(); + const int64_t non_zero_num = mask.nnz(); + auto dims_2d = flatten_to_2d(dims, sparse_dim); + const int cols = dims_2d[1]; + + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); + + out->SetMember(out_indices, out_values, dims, true); +} + +/** + * @brief Filter the DenseTensor x by the + * mask.non_zero_indices() and output a SparseCooTensor + * x and mask must have the same shape. 
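 *
 * A minimal host-side sketch of what the MaskKernel CUDA kernel above does
 * (names follow that kernel; this is an illustration, not a verbatim
 * excerpt): each COO index of `mask` is flattened into a linear row offset
 * using the row-major strides in `sparse_offsets`, and one dense row of
 * `cols` elements is copied from x into the output values.
 *
 *   for (int64_t i = 0; i < non_zero_num; ++i) {
 *     int64_t index = 0;
 *     for (int j = 0; j < sparse_dim; ++j) {
 *       index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j];
 *     }
 *     std::memcpy(out_values_ptr + i * cols,
 *                 x_ptr + index * cols,
 *                 cols * sizeof(T));
 *   }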
+ **/ +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { + SparseMaskGPUKernel(dev_ctx, x, mask, out); + })); +} + +template +__global__ void SparseMaskCopyKernel(const IntT* x_indexs, + const IntT* mask_indexs, + const IntT* bound_out, + const T* x_values, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + const IntT j = bound_out[i]; + if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { + for (int k = 0; k < stride / VecSize; k++) { + // out_values[i * stride + k] = x_values[j * stride + k]; + LoadT vec_x; + phi::Load(x_values + j * stride + k * VecSize, &vec_x); + phi::Store(vec_x, out_values + i * stride + k * VecSize); + } + } + } +} + +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? -1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); + } + } + } +} + +template +void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int32_t sparse_dim = x.sparse_dim(); + auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); + + std::vector sparse_offsets(sparse_dim); + + DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indexs_meta( + indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); + DenseTensorMeta sparse_offset_meta( + indices_dtype, {sparse_dim}, DataLayout::NCHW); + + DenseTensor x_indexs = + phi::Empty(dev_ctx, std::move(x_indexs_meta)); + DenseTensor mask_indexs = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor bound_out = + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + IntT* x_indexs_ptr = x_indexs.data(); + IntT* mask_indexs_ptr = mask_indexs.data(); + IntT* bound_out_ptr = bound_out.data(); + + // 1. calc the offsets of per dim + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + // 2. copy sparse_offsets to device + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 3. 
flatten x indices and mask indices + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); + + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + cudaMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + T* out_ptr = out->data(); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + MaskCopy + <<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } +} + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { + SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_mask, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_mask_helper, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu new file mode 100644 index 0000000000000..d4f40c7d8c19e --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/pool_grad_kernel.cu @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const IntT* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + IntT in_i = rulebook_ptr[real_i]; + IntT out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const IntT* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1); + const int* counter_ptr = counter.data(); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.non_zero_elements().data(); + // TODO(zhangkaihuo): call phi::sparse::EmptyLike + DenseTensor x_grad_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); + T* x_grad_ptr = x_grad_values.data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, &x_grad_values, static_cast(0.0f)); + phi::Copy(dev_ctx, + x.non_zero_indices(), + dev_ctx.GetPlace(), + false, + &x_grad_indices); + + for (int i = 0; i < kernel_size; i++) { + if (counter_ptr[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter_ptr[i] * in_channels, 1); + MaxPoolGradCudaKernel + <<>>(in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i], + counter_ptr[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { + MaxPoolGradGPUKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + 
phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu new file mode 100644 index 0000000000000..255c6621da015 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -0,0 +1,167 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/pool_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const IntT* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + IntT in_i = rulebook_ptr[real_i]; + IntT out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) + **/ +template +void MaxPoolGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. 
product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); + + const IntT* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); + counter->Resize({kernel_size}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); +// 2. max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(0)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (h_counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, h_counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>(in_features_ptr, + rulebook_ptr + offsets[i], + h_counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { + MaxPoolGPUKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + out, + rulebook, + counter); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/mask_kernel.h b/paddle/phi/kernels/sparse/mask_kernel.h new file mode 100644 index 0000000000000..88899e3dc672e --- /dev/null +++ b/paddle/phi/kernels/sparse/mask_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void SparseMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const SparseCooTensor& mask, + SparseCooTensor* out); + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/pool_grad_kernel.h b/paddle/phi/kernels/sparse/pool_grad_kernel.h new file mode 100644 index 0000000000000..ef9f8418b0116 --- /dev/null +++ b/paddle/phi/kernels/sparse/pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes, + SparseCooTensor* x_grad); + +template +SparseCooTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out, + const SparseCooTensor& out_grad, + const std::vector& kernel_sizes) { + SparseCooTensor x_grad; + MaxPoolGradKernel( + dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/pool_kernel.h b/paddle/phi/kernels/sparse/pool_kernel.h new file mode 100644 index 0000000000000..9f4939da8d52a --- /dev/null +++ b/paddle/phi/kernels/sparse/pool_kernel.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
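
A minimal usage sketch for the MaxPool / MaxPoolGrad wrappers declared in this
header and in pool_grad_kernel.h (the float data type, the explicit template
argument, and the local variable values are illustrative assumptions; see
paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc for the real tests):

  DenseTensor rulebook, counter;
  std::vector<int> kernel_sizes{3, 3, 3}, paddings{0, 0, 0},
      dilations{1, 1, 1}, strides{1, 1, 1};
  SparseCooTensor out = phi::sparse::MaxPool<float>(
      dev_ctx, x, kernel_sizes, paddings, dilations, strides,
      &rulebook, &counter);
  SparseCooTensor x_grad = phi::sparse::MaxPoolGrad<float>(
      dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes);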
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); + +template +SparseCooTensor MaxPool(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DenseTensor* rulebook, + DenseTensor* counter) { + SparseCooTensor coo; + MaxPoolKernel(dev_ctx, + x, + kernel_sizes, + paddings, + dilations, + strides, + &coo, + rulebook, + counter); + return coo; +} + +} // namespace sparse +} // namespace phi From 6684d944f555436bc4a1b00d329c912619b8f5d6 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 06:55:28 +0000 Subject: [PATCH 60/70] for ci --- paddle/phi/core/sparse_coo_tensor.h | 16 ++-- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 5 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 22 ++---- .../kernels/sparse/gpu/convolution_kernel.cu | 75 +++++++++---------- paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 41 ++-------- .../kernels/test_sparse_conv3d_dev_api.cc | 8 +- 6 files changed, 67 insertions(+), 100 deletions(-) diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 5982612946f7b..c69c7aab89d28 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -156,6 +156,7 @@ class SparseCooTensor : public TensorBase, /// \brief get the dnese dim int32_t dense_dim() const; + /// \brief query table according to key const std::pair>* table( const std::string& key) const { const auto& iter = table_ptr_->find(key); @@ -164,7 +165,8 @@ class SparseCooTensor : public TensorBase, } return &iter->second; } - // DenseTensor* mutable_rulebook() { return &rulebook_; } + + /// \brief set table according to key void SetTable(const std::string& key, const std::pair>& table) { auto ret = table_ptr_->insert({key, table}); @@ -173,11 +175,14 @@ class SparseCooTensor : public TensorBase, } } + /// \brief get table_ptr_ const std::shared_ptr< std::map>>>& GetTablePtr() const { return table_ptr_; } + + /// \brief set table_ptr_ void SetTablePtr( const std::shared_ptr< std::map>>>& @@ -185,9 +190,6 @@ class SparseCooTensor : public TensorBase, table_ptr_ = table_ptr; } - // const bool subm() const { return subm_; } - // void SetSubm(const bool subm) { subm_ = subm; } - private: // save the indices of non zero elements in original dense tensor DenseTensor non_zero_indices_; @@ -198,11 +200,15 @@ class SparseCooTensor : public TensorBase, // save the number of non zero elements in each batch DDim dims_; - // for sparse conv + // for submanifold conv + // SubmConv will generate a rulebook and a counter, which can be + // reused by different SubmConv. 
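  //
  // A simplified sketch of the lookup-or-build flow (condensed from the
  // Conv3d GPU kernel referenced below; not a verbatim excerpt):
  //
  //   const auto* entry = x.table(key);  // pair<rulebook, counter> or nullptr
  //   if (entry != nullptr) {
  //     // reuse entry->first (the rulebook) and entry->second (the counter)
  //   } else {
  //     // build the rulebook with ProductRuleBook(...), then cache it
  //     // (only when key is non-empty):
  //     out->SetTable(key, std::make_pair(rulebook, counter_vec));
  //   }
  //   out->SetTablePtr(x.GetTablePtr());  // share one cache across layers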
+ // refer to sparse/gpu/convolution_kernel.cu std::shared_ptr< std::map>>> table_ptr_ = std::make_shared< std::map>>>(); + /* --------------------------- */ /* example: non zero element is scalar */ /* --------------------------- */ diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index f7c4b7642d7bd..f27174d581818 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -69,9 +69,10 @@ __global__ void ScatterKernel(const T* input, out + indices_i * channels + channels_i * VecSize); } } + // scatter's index has been grouped in advance -// index_counts record the count of every group -// index_groups save the index of every group +// index_counts record the count of each group +// index_groups save the index of each group template __global__ void ScatterKernelV2(const T* input, const int* index_counts, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 9787393d06960..4363f94f69443 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -562,11 +562,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT* rulebook_ptr = tmp_rulebook.data(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::Empty(dev_ctx, - DenseTensorMeta(x.dtype(), - {x.nnz(), kernel_sizes[4]}, - x.non_zero_elements().layout())); + DenseTensor out_values = phi::Empty(dev_ctx, {x.nnz(), kernel_sizes[4]}); + phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); @@ -609,7 +606,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr, counter_ptr); - out->SetMember(out_indices, out_values, out_dims, true); + out->SetMember(out_indices, out_values, out_dims, false); thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), counter_ptr, @@ -731,13 +728,11 @@ int ProductRuleBook(const Context& dev_ctx, dev_ctx.Wait(); const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - indices_dtype, {sparse_dim, out_nnz}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_nnz, kernel_sizes[4]}, x.non_zero_elements().layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - out->SetMember(out_indices, out_values, out_dims, true); + phi::DenseTensor out_indices = + phi::Empty(dev_ctx, {sparse_dim, out_nnz}); + phi::DenseTensor out_values = + phi::Empty(dev_ctx, {out_nnz, kernel_sizes[4]}); + out->SetMember(out_indices, out_values, out_dims, false); IntT* out_indices_ptr = out_indices.data(); @@ -754,7 +749,6 @@ int ProductRuleBook(const Context& dev_ctx, unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); - // return rulebook_len; GroupIndexs<<(dev_ctx, {kernel_size}); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor out_index; + DenseTensor unique_value; VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; - int n = 0; + int rulebook_len = 0; const IntT* rulebook_ptr = nullptr; bool need_product_rulebook = true; if (subm && !key.empty()) { @@ -93,7 +88,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; + rulebook_len = 
rulebook.dims()[1]; DenseTensor out_indices = phi::EmptyLike(dev_ctx, x.non_zero_indices()); @@ -113,24 +108,25 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, offsets[kernel_size] = offset; } } + if (need_product_rulebook) { DenseTensor tmp_rulebook; - n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - &tmp_rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); + rulebook_len = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + &tmp_rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + &h_counter, + &offsets); rulebook_ptr = tmp_rulebook.data(); out->SetTablePtr(x.GetTablePtr()); @@ -145,14 +141,10 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } // 2. gather - DenseTensorMeta in_features_meta( - x.dtype(), {n, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_features_meta( - x.dtype(), {n, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_features = - phi::Empty(dev_ctx, std::move(out_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; @@ -161,7 +153,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, Gather(dev_ctx, x.non_zero_elements().data(), rulebook_ptr, - n, + rulebook_len, in_channels, in_features_ptr); @@ -172,20 +164,25 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out_values, static_cast(0.0f)); if (subm) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, 1); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); unique_value.ResizeAndAllocate( {static_cast(out->nnz() * kernel_size)}); - out_index.ResizeAndAllocate({static_cast(n)}); + out_index.ResizeAndAllocate({static_cast(rulebook_len)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); phi::backends::gpu::GpuMemsetAsync( - out_index_ptr, 0, sizeof(int) * n, dev_ctx.stream()); + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); GroupIndexs<<>>( - n, kernel_size, rulebook_ptr + n, out_index_ptr, unique_value_ptr); + dev_ctx.stream()>>>(rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); } + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { if (h_counter[i] <= 0) { diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu index f1a955477a595..ad55ad08ad527 100644 --- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -71,11 +71,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); @@ -121,29 +117,6 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - 
const int64_t stride, - T* out_values) { - CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - using LoadT = phi::AlignedVector; - using StoreT = phi::AlignedVector; - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride / VecSize; k++) { - // out_values[i * stride + k] = x_values[j * stride + k]; - LoadT vec_x; - phi::Load(x_values + j * stride + k * VecSize, &vec_x); - phi::Store(vec_x, out_values + i * stride + k * VecSize); - } - } - } -} - template __global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { @@ -214,11 +187,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 3. flatten x indices and mask indices @@ -252,7 +221,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, table_size *= x_dims[i]; } DenseTensor table = phi::Empty(dev_ctx, {table_size}); - cudaMemsetAsync( + phi::backends::gpu::GpuMemsetAsync( table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); const int64_t stride = x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; @@ -308,7 +277,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_mask, +PD_REGISTER_KERNEL(mask, GPU, ALL_LAYOUT, phi::sparse::SparseMaskKernel, @@ -323,7 +292,7 @@ PD_REGISTER_KERNEL(sparse_mask, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -PD_REGISTER_KERNEL(sparse_mask_helper, +PD_REGISTER_KERNEL(mask_helper, GPU, ALL_LAYOUT, phi::sparse::SparseMaskHelperKernel, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index df4fec61a9a3d..48cdae5aa0868 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -121,7 +121,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0", + "Conv3d", &rulebook, &counter); @@ -152,7 +152,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d"); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -209,7 +209,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0", + "Conv3d", &d_rulebook, &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); @@ -258,7 +258,7 @@ void TestConv3dBase(const std::vector& indices, strides, 1, subm, - "Conv3d_0"); + "Conv3d"); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = From 4346bbb188371be8b2088f6ba7b672796b3f8ce3 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 08:03:26 +0000 Subject: [PATCH 61/70] fix comment --- paddle/phi/kernels/funcs/sparse/convolution.h | 29 +++++++++++++++ .../sparse/cpu/convolution_grad_kernel.cc | 21 ++--------- .../sparse/gpu/convolution_grad_kernel.cu | 35 ++++--------------- .../kernels/sparse/gpu/convolution_kernel.cu | 4 +-- 4 files changed, 40 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index f3caa2a62f4a8..99aeb8eaf6098 
100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -188,6 +188,35 @@ inline void PrefixSum(const T* counter, T* offsets, const int n) { offsets[n] = offset; } +template +inline const IntT* GetRulebookPtr(const SparseCooTensor& coo, + const DenseTensor& rulebook, + const std::string& key, + int* rulebook_len) { + if (!key.empty()) { + const auto* table = coo.table(key); + if (table != nullptr) { + const DenseTensor& tmp_rulebook = table->first; + *rulebook_len = tmp_rulebook.dims()[1]; + return tmp_rulebook.data(); + } + } + *rulebook_len = rulebook.dims()[1]; + return rulebook.data(); +} + +inline const int* GetCounterPtr(const SparseCooTensor& coo, + const DenseTensor& counter, + const std::string& key) { + if (!key.empty()) { + const auto* table = coo.table(key); + if (table != nullptr) { + return table->second.data(); + } + } + return counter.data(); +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5c6c2539c0a74..5e51a56e53cb7 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -52,24 +52,9 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, const int out_channels = kernel_dims[4]; int rulebook_len = 0; - const IntT* rulebook_ptr = nullptr; - const int* counter_ptr = nullptr; - bool cache_in_table = false; - if (!key.empty()) { - const auto* table = out.table(key); - if (table != nullptr) { - cache_in_table = true; - const DenseTensor& tmp_rulebook = table->first; - rulebook_ptr = tmp_rulebook.data(); - rulebook_len = tmp_rulebook.dims()[1]; - counter_ptr = table->second.data(); - } - } - if (!cache_in_table) { - rulebook_ptr = rulebook.data(); - rulebook_len = rulebook.dims()[1]; - counter_ptr = counter.data(); - } + const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr( + out, rulebook, key, &rulebook_len); + const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key); DenseTensorMeta in_features_meta( x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 1a6842416e3dd..08e3d71c961ac 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -59,44 +58,22 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, const int out_channels = kernel_dims[4]; int rulebook_len = 0; - const IntT* rulebook_ptr = nullptr; - const int* counter_ptr = nullptr; - bool cache_in_table = false; - if (!key.empty()) { - const auto* table = out.table(key); - if (table != nullptr) { - cache_in_table = true; - const DenseTensor& tmp_rulebook = table->first; - rulebook_ptr = tmp_rulebook.data(); - rulebook_len = tmp_rulebook.dims()[1]; - counter_ptr = table->second.data(); - } - } - if (!cache_in_table) { - rulebook_ptr = rulebook.data(); - rulebook_len = rulebook.dims()[1]; - counter_ptr = counter.data(); - } + const IntT* rulebook_ptr = phi::funcs::sparse::GetRulebookPtr( + out, rulebook, key, &rulebook_len); + const int* counter_ptr = phi::funcs::sparse::GetCounterPtr(out, counter, key); - DenseTensorMeta in_features_meta( - x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); - DenseTensorMeta d_x_features_meta( - x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_grad_features_meta( - x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor d_x_features = - phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_grad_features = - phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* d_x_features_ptr = d_x_features.data(); T* out_grad_features_ptr = out_grad_features.data(); *kernel_grad = phi::EmptyLike(dev_ctx, kernel); T* d_kernel_ptr = kernel_grad->data(); - phi::funcs::SetConstant set_zero; phi::backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 51e0dfcc40348..4afa197eb4cca 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -72,8 +72,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 1. 
product rulebook DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); - DenseTensor out_index; - DenseTensor unique_value; + DenseTensor out_index = phi::Empty(dev_ctx, {1}); + DenseTensor unique_value = phi::Empty(dev_ctx, {1}); VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; int rulebook_len = 0; From 3187f52b9b4a29b12873ac0235bf28745811b38d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 09:10:42 +0000 Subject: [PATCH 62/70] opt code structure --- paddle/phi/kernels/funcs/sparse/convolution.h | 54 +++++++++++++++++++ .../kernels/sparse/cpu/convolution_kernel.cc | 53 +++++++----------- .../kernels/sparse/gpu/convolution_kernel.cu | 48 +++++------------ 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index 99aeb8eaf6098..a4027670a508c 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { @@ -217,6 +218,59 @@ inline const int* GetCounterPtr(const SparseCooTensor& coo, return counter.data(); } +template +inline const IntT* PrepareSubm(const Context& dev_ctx, + const SparseCooTensor& x, + const std::string& key, + const DDim& out_dims, + SparseCooTensor* out, + std::vector* counter, + std::vector* offsets, + int* rulebook_len, + bool* need_product_rulebook) { + const auto* table = x.table(key); + if (table != nullptr) { + *need_product_rulebook = false; + const DenseTensor& rulebook = table->first; + memcpy(counter->data(), + table->second.data(), + table->second.size() * sizeof(int)); + out->SetTablePtr(x.GetTablePtr()); + + *rulebook_len = rulebook.dims()[1]; + + DenseTensor out_indices = + phi::EmptyLike(dev_ctx, x.non_zero_indices()); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::Copy( + dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); + out->SetMember(out_indices, out_values, out_dims, false); + PrefixSum(counter->data(), offsets->data(), counter->size()); + return rulebook.data(); + } + return nullptr; +} + +template +inline void SaveToTable(const Context& dev_ctx, + const SparseCooTensor& x, + const std::string& key, + const DenseTensor& in_rulebook, + const std::vector& counter_vec, + SparseCooTensor* out, + DenseTensor* out_rulebook, + DenseTensor* counter) { + out->SetTablePtr(x.GetTablePtr()); + if (!key.empty()) { + out->SetTable(key, std::make_pair(in_rulebook, counter_vec)); + } else { + *out_rulebook = in_rulebook; + counter->Resize({static_cast(counter_vec.size())}); + int* counter_ptr = dev_ctx.template HostAlloc(counter); + memcpy(counter_ptr, counter_vec.data(), counter_vec.size() * sizeof(int)); + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index f5f7497df96fb..42450427bc0a9 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -70,34 +70,23 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. 
product rulebook std::vector counter_per_kernel(kernel_size, 0); + std::vector offsets(kernel_size + 1); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; int n = 0; bool need_product_rulebook = true; if (subm && !key.empty()) { - const auto* table = x.table(key); - if (table != nullptr) { - need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - n = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - memcpy(counter_per_kernel.data(), - table->second.data(), - kernel_size * sizeof(int)); - } + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + &counter_per_kernel, + &offsets, + &n, + &need_product_rulebook); } if (need_product_rulebook) { DenseTensor tmp_rulebook; @@ -117,17 +106,14 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, n = tmp_rulebook.dims()[1]; rulebook_ptr = tmp_rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - if (!key.empty()) { - out->SetTable(key, std::make_pair(tmp_rulebook, counter_per_kernel)); - } else { - *rulebook = tmp_rulebook; - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, - counter_per_kernel.data(), - counter_per_kernel.size() * sizeof(int)); - } + phi::funcs::sparse::SaveToTable(dev_ctx, + x, + key, + tmp_rulebook, + counter_per_kernel, + out, + rulebook, + counter); } // int n = rulebook->dims()[1]; const int* counter_ptr = counter_per_kernel.data(); @@ -152,7 +138,6 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, // 3. 
call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); - std::vector offsets(kernel_size + 1); int offset = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 4afa197eb4cca..05aba7521eca5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -80,33 +80,16 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, const IntT* rulebook_ptr = nullptr; bool need_product_rulebook = true; if (subm && !key.empty()) { - const auto* table = x.table(key); - if (table != nullptr) { - need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - rulebook_ptr = rulebook.data(); - memcpy(h_counter.data(), table->second.data(), kernel_size * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); - - rulebook_len = rulebook.dims()[1]; - - DenseTensor out_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor out_values = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &out_indices); - out->SetMember(out_indices, out_values, out_dims, true); - IntT offset = 0; - for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += h_counter[i]; - } - offsets[kernel_size] = offset; - } + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + &h_counter, + &offsets, + &rulebook_len, + &need_product_rulebook); } if (need_product_rulebook) { @@ -129,15 +112,8 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, &offsets); rulebook_ptr = tmp_rulebook.data(); - out->SetTablePtr(x.GetTablePtr()); - if (!key.empty()) { - out->SetTable(key, std::make_pair(tmp_rulebook, h_counter)); - } else { - *rulebook = tmp_rulebook; - counter->Resize({kernel_size}); - int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, h_counter.data(), h_counter.size() * sizeof(int)); - } + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); } // 2. 
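Taken together, PrepareSubm and SaveToTable implement the submanifold-convolution cache: the first SubmConv3D that runs under a given key builds the rulebook and stores it on the output tensor, and any later layer passing the same key reuses it instead of recomputing. A control-flow sketch of the forward kernels refactored in this commit, with template parameters written explicitly as an assumption (the hunks drop them) and ProductRuleBook elided:

// Sketch only; mirrors the CPU/GPU forward kernels above, not verbatim code.
bool need_product_rulebook = true;
const IntT* rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT>(
    dev_ctx, x, key, out_dims, out, &h_counter, &offsets, &rulebook_len,
    &need_product_rulebook);           // cache hit: reuse the rulebook under key
if (need_product_rulebook) {           // cache miss: build it once and store it
  DenseTensor tmp_rulebook;
  // ... ProductRuleBook(...) fills tmp_rulebook and h_counter ...
  rulebook_ptr = tmp_rulebook.data<IntT>();
  phi::funcs::sparse::SaveToTable(
      dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}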
gather From aa284f4a1b43280c91c8952797536720387f293c Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 6 Jul 2022 12:20:37 +0000 Subject: [PATCH 63/70] rename conv_kernel --- paddle/phi/api/yaml/sparse_api.yaml | 6 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 6 +- paddle/phi/kernels/sparse/conv_grad_kernel.h | 79 ++++++++++++++ paddle/phi/kernels/sparse/conv_kernel.h | 68 ++++++++++++ .../kernels/sparse/convolution_grad_kernel.h | 80 -------------- .../phi/kernels/sparse/convolution_kernel.h | 68 ------------ .../kernels/sparse/cpu/coalesced_kernel.cc | 2 +- .../sparse/cpu/{convolution.h => conv.h} | 2 +- ...ion_grad_kernel.cc => conv_grad_kernel.cc} | 100 +++++++++--------- .../{convolution_kernel.cc => conv_kernel.cc} | 78 +++++++------- paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 2 +- .../gpu/{convolution.cu.h => conv.cu.h} | 80 +++++++------- ...ion_grad_kernel.cu => conv_grad_kernel.cu} | 100 +++++++++--------- .../{convolution_kernel.cu => conv_kernel.cu} | 82 +++++++------- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 2 +- paddle/phi/tests/api/test_sparse_conv_api.cc | 6 +- .../kernels/test_sparse_conv3d_dev_api.cc | 100 +++++++++--------- .../tests/unittests/test_sparse_utils_op.py | 2 + python/paddle/incubate/sparse/coalesced.py | 27 +++++ .../incubate/sparse/nn/functional/conv.py | 6 +- 20 files changed, 463 insertions(+), 433 deletions(-) create mode 100644 paddle/phi/kernels/sparse/conv_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/conv_kernel.h delete mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h delete mode 100644 paddle/phi/kernels/sparse/convolution_kernel.h rename paddle/phi/kernels/sparse/cpu/{convolution.h => conv.h} (99%) rename paddle/phi/kernels/sparse/cpu/{convolution_grad_kernel.cc => conv_grad_kernel.cc} (77%) rename paddle/phi/kernels/sparse/cpu/{convolution_kernel.cc => conv_kernel.cc} (82%) rename paddle/phi/kernels/sparse/gpu/{convolution.cu.h => conv.cu.h} (94%) rename paddle/phi/kernels/sparse/gpu/{convolution_grad_kernel.cu => conv_grad_kernel.cu} (81%) rename paddle/phi/kernels/sparse/gpu/{convolution_kernel.cu => conv_kernel.cu} (85%) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index a73529dde3c17..5780bec804008 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -7,14 +7,14 @@ layout : x backward : add_grad -- api : conv3d +- api : conv3d_coo args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(out), Tensor(rulebook), Tensor(counter) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense, dense} + func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense, dense} layout : x intermediate: rulebook, counter - backward : conv3d_grad + backward : conv3d_coo_grad - api : coo_to_dense args : (Tensor x) diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 4d0371257d810..56f0595351d35 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -6,12 +6,12 @@ func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} -- backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) +- 
backward_api : conv3d_coo_grad + forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + func : conv3d_coo_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/conv_grad_kernel.h b/paddle/phi/kernels/sparse/conv_grad_kernel.h new file mode 100644 index 0000000000000..867f6b5a53f37 --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::tuple Conv3dCooGrad( + const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key) { + SparseCooTensor x_grad; + DenseTensor kernel_grad; + + // TODO(zhangkaihuo): call InferMeta func here + Conv3dCooGradKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + &x_grad, + &kernel_grad); + return std::make_tuple(x_grad, kernel_grad); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/conv_kernel.h b/paddle/phi/kernels/sparse/conv_kernel.h new file mode 100644 index 0000000000000..0c5a2081a6f3d --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_kernel.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter); + +template +SparseCooTensor Conv3dCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + DenseTensor* rulebook, + DenseTensor* counter) { + SparseCooTensor coo; + Conv3dCooKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + &coo, + rulebook, + counter); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h deleted file mode 100644 index 54d09babb2cf9..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" - -namespace phi { -namespace sparse { - -template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad); - -template -std::tuple Conv3dGrad( - const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key) { - SparseCooTensor x_grad; - DenseTensor kernel_grad; - - // TODO(zhangkaihuo): call InferMeta func here - Conv3dGradKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - &x_grad, - &kernel_grad); - return std::make_tuple(x_grad, kernel_grad); -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h deleted file mode 100644 index 62559d4e0ff1e..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter); - -template -SparseCooTensor Conv3d(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - DenseTensor* rulebook, - DenseTensor* counter) { - SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - &coo, - rulebook, - counter); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc index 9d1f71afceb5e..b42294cfc0315 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -107,7 +107,7 @@ void CoalescedKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesced, CPU, ALL_LAYOUT, phi::sparse::CoalescedKernel, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/conv.h similarity index 99% rename from paddle/phi/kernels/sparse/cpu/convolution.h rename to paddle/phi/kernels/sparse/cpu/conv.h index 07baf77ff5d27..82480e492abae 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/conv.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc similarity index 77% rename from paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index 5e51a56e53cb7..44ad2fa588b55 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { @@ -31,21 +31,21 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -181,48 +181,48 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { - Conv3dGradCPUKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradCPUKernel", ([&] { + Conv3dCooGradCPUKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, CPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc similarity index 82% rename from 
paddle/phi/kernels/sparse/cpu/convolution_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 42450427bc0a9..5a892e64c65a0 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { @@ -28,18 +28,18 @@ namespace sparse { * out: (N, D, H, W, OC) **/ template -void Conv3dCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -178,32 +178,32 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { - Conv3dCPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - out, - rulebook, - counter); + x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] { + Conv3dCooCPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + out, + rulebook, + counter); })); } @@ -211,6 +211,6 @@ void Conv3dKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - sparse_conv3d, CPU, ALL_LAYOUT, phi::sparse::Conv3dKernel, float, double) { + conv3d_coo, CPU, ALL_LAYOUT, phi::sparse::Conv3dCooKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc index 38e512bd00c93..36949f7161245 100644 --- a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/cpu/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/conv.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h similarity index 94% rename from paddle/phi/kernels/sparse/gpu/convolution.cu.h rename to paddle/phi/kernels/sparse/gpu/conv.cu.h index 4363f94f69443..8cd55e50d98d0 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -14,11 +14,9 @@ limitations under the License. */ #pragma once -#include -#include #include -#include #include +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -31,7 +29,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { @@ -490,6 +487,34 @@ __global__ void GroupIndexsV2(const int rulebook_len, } } +inline void CallThrustScan(const GPUContext& dev_ctx, + const int* counter_ptr, + const int kernel_size, + int* offsets_ptr, + int* h_counter_ptr, + int* h_offsets_ptr) { +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + + phi::backends::gpu::GpuMemcpyAsync(h_counter_ptr, + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + + phi::backends::gpu::GpuMemcpyAsync(h_offsets_ptr, + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); +} + // the basic algorithm can refer to convolution_kernel.cc or // the second paper // example: @@ -608,22 +633,13 @@ int ProductRuleBook(const Context& dev_ctx, out->SetMember(out_indices, out_values, out_dims, false); - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + CallThrustScan(dev_ctx, + counter_ptr, + kernel_size, + offsets_ptr, + h_counter->data(), + h_offsets->data()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); dev_ctx.Wait(); int rulebook_len = (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; @@ -675,26 +691,12 @@ int ProductRuleBook(const Context& dev_ctx, IntT rulebook_len = (last - rulebook_ptr) / 2; -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + CallThrustScan(dev_ctx, + counter_ptr, + kernel_size, + 
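CallThrustScan wraps the exclusive scan and the two device-to-host copies that both branches of ProductRuleBook need. A small worked example of what the scan produces (the counter values are made up for illustration):

// counter      = {3, 0, 5, 2}   // rulebook entries generated per kernel offset
// offsets      = {0, 3, 3, 8}   // exclusive prefix sum: start of each segment
// rulebook_len = offsets[kernel_size - 1] + counter[kernel_size - 1]
//              = 8 + 2 = 10     // total number of rulebook entries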
offsets_ptr, + h_counter->data(), + h_offsets->data()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu similarity index 81% rename from paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 08e3d71c961ac..848517aae2549 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" namespace phi { namespace sparse { @@ -37,21 +37,21 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -218,48 +218,48 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const SparseCooTensor& out, - const DenseTensor& rulebook, - const DenseTensor& counter, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out, + const DenseTensor& rulebook, + const DenseTensor& counter, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), 
"Conv3dGradGPUKernel", ([&] { - Conv3dGradGPUKernel(dev_ctx, - x, - kernel, - out, - rulebook, - counter, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - key, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradGPUKernel", ([&] { + Conv3dCooGradGPUKernel(dev_ctx, + x, + kernel, + out, + rulebook, + counter, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + key, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, GPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu similarity index 85% rename from paddle/phi/kernels/sparse/gpu/convolution_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 05aba7521eca5..89159f4c55a70 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" #include "glog/logging.h" @@ -29,18 +29,18 @@ namespace phi { namespace sparse { template -void Conv3dGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -205,42 +205,42 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, * counter: return counter if key is not vailed else return nullptr **/ template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - const std::string& key, - SparseCooTensor* out, - DenseTensor* rulebook, - DenseTensor* counter) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + SparseCooTensor* out, + DenseTensor* rulebook, + DenseTensor* counter) { 
PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { - Conv3dGPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - key, - out, - rulebook, - counter); + x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { + Conv3dCooGPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + key, + out, + rulebook, + counter); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d, +PD_REGISTER_KERNEL(conv3d_coo, GPU, ALL_LAYOUT, - phi::sparse::Conv3dKernel, + phi::sparse::Conv3dCooKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 255c6621da015..7ac727cae4ca9 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" namespace phi { namespace sparse { diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index f8ca0f6651c9d..b1df197f42f47 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" -PD_DECLARE_KERNEL(sparse_conv3d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv3d_coo, CPU, ALL_LAYOUT); template void TestConv3dBase(const std::vector& indices, @@ -76,8 +76,8 @@ void TestConv3dBase(const std::vector& indices, kernel.size() * sizeof(T)); if (!std::is_same::value) { - auto tensor_out = paddle::experimental::sparse::conv3d( - x, weight, paddings, dilations, strides, 1, false, "Conv3d_0"); + auto tensor_out = paddle::experimental::sparse::conv3d_coo( + x, weight, paddings, dilations, strides, 1, false, "Conv3d"); auto out = std::dynamic_pointer_cast(tensor_out.impl()); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 48cdae5aa0868..df0e87c6b5a49 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -23,8 +23,8 @@ limitations under the License. 
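The comment above Conv3dCooKernel ("return rulebook if key is not vailed") reads more clearly as: when key is empty, the rulebook and counter come back through the rulebook/counter outputs; when a key is given, they are cached on out under that key (see SaveToTable) and the outputs need not be consumed. A caller-side sketch, mirroring the dev-API test in this patch; the template argument is an assumption, since the hunks drop it:

// Caller-side sketch; "Conv3d" is the cache key the tests in this patch use.
DenseTensor rulebook, counter;
SparseCooTensor out = sparse::Conv3dCoo<T>(dev_ctx, x, kernel, paddings,
                                           dilations, strides, /*groups=*/1,
                                           subm, "Conv3d", &rulebook, &counter);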
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/sparse/coalesced_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace tests { @@ -113,17 +113,17 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { DenseTensor rulebook, counter; - SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, - x_tensor, - kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d", - &rulebook, - &counter); + SparseCooTensor out = sparse::Conv3dCoo(dev_ctx_cpu, + x_tensor, + kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d", + &rulebook, + &counter); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -140,19 +140,19 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - kernel_tensor, - out, - rulebook, - counter, - out, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d"); + sparse::Conv3dCooGrad(dev_ctx_cpu, + x_tensor, + kernel_tensor, + out, + rulebook, + counter, + out, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d"); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -201,17 +201,17 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); DenseTensor d_rulebook, d_counter; - SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d", - &d_rulebook, - &d_counter); + SparseCooTensor d_out = sparse::Conv3dCoo(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d", + &d_rulebook, + &d_counter); SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -246,19 +246,19 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - d_out, - d_rulebook, - d_counter, - d_out, - paddings, - dilations, - strides, - 1, - subm, - "Conv3d"); + sparse::Conv3dCooGrad(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + d_out, + d_rulebook, + d_counter, + d_out, + paddings, + dilations, + strides, + 1, + subm, + "Conv3d"); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index a12425b69299e..b71ef0357cb37 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -298,6 +298,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesced(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -310,6 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, 
dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesced(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py index dcd2f8ca28f3a..23c82499851d3 100644 --- a/python/paddle/incubate/sparse/coalesced.py +++ b/python/paddle/incubate/sparse/coalesced.py @@ -22,4 +22,31 @@ @dygraph_only def coalesced(x): + r""" + the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . + + Args: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: return the SparseCooTensor after coalesced. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate import sparse + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesced(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ return _C_ops.final_state_sparse_coalesced(x) diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index 2dda83b2c1659..503ad9a127b0f 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -63,9 +63,9 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, - stride, groups, subm, - key if key is not None else "") + pre_bias = _C_ops.final_state_sparse_conv3d_coo( + x, weight, padding, dilation, stride, groups, subm, + key if key is not None else "") if bias is not None: values = pre_bias.values() add_bias = elementwise_add(values, bias, axis=1) From 0b5ca0eeaa907535f26d18a80a5f6f68734b148d Mon Sep 17 00:00:00 2001 From: Zihang Yao <1162526220@qq.com> Date: Fri, 8 Jul 2022 12:40:52 +0800 Subject: [PATCH 64/70] fix --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 1936fccc63e4a..61694db7e8ed3 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -236,8 +236,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - merge_block_vertical( - x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.y > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -272,12 +276,12 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - merge_block_vertical(x_sum, - x_square_sum, - smem_sum, - smem_square_sum, - &x_sum, - &x_square_sum); + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); // final compute if (threadIdx.y == 0) { @@ -400,8 +404,12 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - merge_block_horizonal( - x_sum, x_square_sum, smem_sum, smem_square_sum, &x_sum, &x_square_sum); + merge_block_horizonal(x_sum, + x_square_sum, + 
&smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.x > 1) { volatile BatchNormParamType *staging_sum = block_data_ptr; @@ -436,12 +444,12 @@ static __global__ void BNForwardTraining2DCompStat( } // horizonal block sum - merge_block_horizonal(x_sum, - x_square_sum, - smem_sum, - smem_square_sum, - &x_sum, - &x_square_sum); + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); // final compute if (threadIdx.x == 0) { From 33ebaf5d8c35cbd0613477e0d37c84f82c7fd13f Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 8 Jul 2022 08:21:24 +0000 Subject: [PATCH 65/70] rename table_ptr to indices_dict --- paddle/phi/api/yaml/sparse_api.yaml | 4 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 2 +- paddle/phi/core/sparse_coo_tensor.h | 50 +++++++++-------- paddle/phi/kernels/funcs/sparse/convolution.h | 42 +++++++------- .../{coalesced_kernel.h => coalesce_kernel.h} | 10 ++-- ...coalesced_kernel.cc => coalesce_kernel.cc} | 22 ++++---- paddle/phi/kernels/sparse/cpu/conv.h | 6 +- paddle/phi/kernels/sparse/cpu/conv_kernel.cc | 38 ++++++------- paddle/phi/kernels/sparse/cpu/pool_kernel.cc | 6 +- .../gpu/.convolution_grad_kernel.cu.swp | Bin 20480 -> 0 bytes ...coalesced_kernel.cu => coalesce_kernel.cu} | 22 ++++---- paddle/phi/kernels/sparse/gpu/conv.cu.h | 23 +++----- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 22 +++++--- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 4 +- .../phi/kernels/sparse/sparse_utils_kernel.h | 7 +-- .../kernels/test_sparse_conv3d_dev_api.cc | 4 +- .../tests/kernels/test_sparse_pool_dev_api.cc | 4 +- .../tests/unittests/test_sparse_conv_op.py | 2 +- python/paddle/incubate/sparse/__init__.py | 5 +- python/paddle/incubate/sparse/coalesced.py | 52 ------------------ python/paddle/incubate/sparse/unary.py | 32 +++++++++++ 21 files changed, 163 insertions(+), 194 deletions(-) rename paddle/phi/kernels/sparse/{coalesced_kernel.h => coalesce_kernel.h} (78%) rename paddle/phi/kernels/sparse/cpu/{coalesced_kernel.cc => coalesce_kernel.cc} (87%) delete mode 100644 paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp rename paddle/phi/kernels/sparse/gpu/{coalesced_kernel.cu => coalesce_kernel.cu} (92%) delete mode 100644 python/paddle/incubate/sparse/coalesced.py diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 5780bec804008..917137e2343d9 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -132,11 +132,11 @@ layout : x backward : values_grad -- api: coalesced +- api: coalesce args : (Tensor x) output : Tensor(out) kernel : - func: coalesced{sparse_coo -> sparse_coo} + func: coalesce{sparse_coo -> sparse_coo} layout : x - api: full_like diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 56f0595351d35..b4d990e3ae5e0 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -7,7 +7,7 @@ add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - backward_api : conv3d_coo_grad - forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor), Tensor(counter@DenseTensor) + forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter) args : (Tensor x, Tensor kernel, Tensor 
out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) kernel : diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index c69c7aab89d28..300ae8a0ab958 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -157,37 +157,45 @@ class SparseCooTensor : public TensorBase, int32_t dense_dim() const; /// \brief query table according to key - const std::pair>* table( + const std::pair* IndicesPairs( const std::string& key) const { - const auto& iter = table_ptr_->find(key); - if (iter == table_ptr_->end()) { + if (indices_dict_ == nullptr) { + return nullptr; + } + const auto& iter = indices_dict_->find(key); + if (iter == indices_dict_->end()) { return nullptr; } return &iter->second; } - /// \brief set table according to key - void SetTable(const std::string& key, - const std::pair>& table) { - auto ret = table_ptr_->insert({key, table}); + /// \brief save (key, indices_pairs) + void SaveIndicesPairs( + const std::string& key, + const std::pair& indices_pairs) { + if (indices_dict_ == nullptr) { + indices_dict_ = std::make_shared< + std::map>>(); + } + auto ret = indices_dict_->insert({key, indices_pairs}); if (ret.second == false) { - ret.first->second = table; + ret.first->second = indices_pairs; } } - /// \brief get table_ptr_ + /// \brief get indices_dict_ const std::shared_ptr< - std::map>>>& - GetTablePtr() const { - return table_ptr_; + std::map>>& + GetIndicesDict() const { + return indices_dict_; } - /// \brief set table_ptr_ - void SetTablePtr( + /// \brief set indices_dict_ + void SetIndicesDict( const std::shared_ptr< - std::map>>>& - table_ptr) { - table_ptr_ = table_ptr; + std::map>>& + indices_dict) { + indices_dict_ = indices_dict; } private: @@ -203,11 +211,9 @@ class SparseCooTensor : public TensorBase, // for submanifold conv // SubmConv will generate a rulebook and a counter, which can be // reused by different SubmConv. - // refer to sparse/gpu/convolution_kernel.cu - std::shared_ptr< - std::map>>> - table_ptr_ = std::make_shared< - std::map>>>(); + // refer to sparse/gpu/convolution_kernel.cu. 
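The renamed members make the cache container explicit: indices_dict_ maps a caller-supplied key to a (rulebook, counter) pair, SaveIndicesPairs stores an entry, and IndicesPairs returns it, or nullptr on a miss. A minimal producer/consumer sketch using only the members declared above; the key string is an arbitrary example, and the pair element types are assumed to be DenseTensor because the plain-text hunks drop template arguments:

// Illustrative only; the real call sites are the conv kernels referenced above.
std::pair<DenseTensor, DenseTensor> pairs{rulebook, counter};
out.SaveIndicesPairs("subm_conv1", pairs);           // first SubmConv3D: store
const auto* hit = out.IndicesPairs("subm_conv1");    // later SubmConv3D: look up
if (hit != nullptr) {
  const DenseTensor& cached_rulebook = hit->first;   // reuse, skip rulebook build
  const DenseTensor& cached_counter = hit->second;
}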
+ std::shared_ptr>> + indices_dict_ = nullptr; /* --------------------------- */ /* example: non zero element is scalar */ diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index a4027670a508c..0c6b8b76b54d8 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -195,9 +195,9 @@ inline const IntT* GetRulebookPtr(const SparseCooTensor& coo, const std::string& key, int* rulebook_len) { if (!key.empty()) { - const auto* table = coo.table(key); - if (table != nullptr) { - const DenseTensor& tmp_rulebook = table->first; + const auto* indices_pairs = coo.IndicesPairs(key); + if (indices_pairs != nullptr) { + const DenseTensor& tmp_rulebook = indices_pairs->first; *rulebook_len = tmp_rulebook.dims()[1]; return tmp_rulebook.data(); } @@ -210,9 +210,9 @@ inline const int* GetCounterPtr(const SparseCooTensor& coo, const DenseTensor& counter, const std::string& key) { if (!key.empty()) { - const auto* table = coo.table(key); - if (table != nullptr) { - return table->second.data(); + const auto* indices_pairs = coo.IndicesPairs(key); + if (indices_pairs != nullptr) { + return indices_pairs->second.data(); } } return counter.data(); @@ -224,18 +224,18 @@ inline const IntT* PrepareSubm(const Context& dev_ctx, const std::string& key, const DDim& out_dims, SparseCooTensor* out, - std::vector* counter, - std::vector* offsets, + int* counter, + int* offsets, int* rulebook_len, bool* need_product_rulebook) { - const auto* table = x.table(key); - if (table != nullptr) { + const auto* indices_pairs = x.IndicesPairs(key); + if (indices_pairs != nullptr) { *need_product_rulebook = false; - const DenseTensor& rulebook = table->first; - memcpy(counter->data(), - table->second.data(), - table->second.size() * sizeof(int)); - out->SetTablePtr(x.GetTablePtr()); + const DenseTensor& rulebook = indices_pairs->first; + const int counter_size = indices_pairs->second.numel(); + memcpy( + counter, indices_pairs->second.data(), counter_size * sizeof(int)); + out->SetIndicesDict(x.GetIndicesDict()); *rulebook_len = rulebook.dims()[1]; @@ -245,7 +245,7 @@ inline const IntT* PrepareSubm(const Context& dev_ctx, phi::Copy( dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), false, &out_indices); out->SetMember(out_indices, out_values, out_dims, false); - PrefixSum(counter->data(), offsets->data(), counter->size()); + PrefixSum(counter, offsets, counter_size); return rulebook.data(); } return nullptr; @@ -256,18 +256,18 @@ inline void SaveToTable(const Context& dev_ctx, const SparseCooTensor& x, const std::string& key, const DenseTensor& in_rulebook, - const std::vector& counter_vec, + const DenseTensor& h_counter, SparseCooTensor* out, DenseTensor* out_rulebook, DenseTensor* counter) { - out->SetTablePtr(x.GetTablePtr()); + out->SetIndicesDict(x.GetIndicesDict()); if (!key.empty()) { - out->SetTable(key, std::make_pair(in_rulebook, counter_vec)); + out->SaveIndicesPairs(key, std::make_pair(in_rulebook, h_counter)); } else { *out_rulebook = in_rulebook; - counter->Resize({static_cast(counter_vec.size())}); + counter->Resize({h_counter.numel()}); int* counter_ptr = dev_ctx.template HostAlloc(counter); - memcpy(counter_ptr, counter_vec.data(), counter_vec.size() * sizeof(int)); + memcpy(counter_ptr, h_counter.data(), h_counter.numel() * sizeof(int)); } } diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesce_kernel.h similarity index 78% rename from 
paddle/phi/kernels/sparse/coalesced_kernel.h rename to paddle/phi/kernels/sparse/coalesce_kernel.h index d2f5f8f3150af..cb8b98fd87404 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/sparse/coalesce_kernel.h @@ -22,14 +22,14 @@ namespace phi { namespace sparse { template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); template -SparseCooTensor Coalesced(const Context& dev_ctx, const SparseCooTensor& x) { +SparseCooTensor Coalesce(const Context& dev_ctx, const SparseCooTensor& x) { SparseCooTensor coo; - CoalescedKernel(dev_ctx, x, &coo); + CoalesceKernel(dev_ctx, x, &coo); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc similarity index 87% rename from paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc rename to paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc index b42294cfc0315..95d8abd6bcf5c 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" @@ -22,9 +22,9 @@ namespace phi { namespace sparse { template -void CoalescedCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -95,22 +95,22 @@ void CoalescedCPUKernel(const CPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { - CoalescedCPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceCPUKernel", ([&] { + CoalesceCPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(coalesced, +PD_REGISTER_KERNEL(coalesce, CPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/cpu/conv.h b/paddle/phi/kernels/sparse/cpu/conv.h index 82480e492abae..e47f33c8c4834 100644 --- a/paddle/phi/kernels/sparse/cpu/conv.h +++ b/paddle/phi/kernels/sparse/cpu/conv.h @@ -41,12 +41,12 @@ void ProductRuleBook(const Context& dev_ctx, const DDim& out_dims, const bool subm, DenseTensor* rulebook, - std::vector* counter_per_kernel) { + int* counter_per_kernel) { const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const IntT* indices_ptr = non_zero_indices.data(); int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; - memset(counter_per_kernel->data(), 0, kernel_size * sizeof(int)); + memset(counter_per_kernel, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len @@ -106,7 
+106,7 @@ void ProductRuleBook(const Context& dev_ctx, } if (rulebook_ptr == nullptr) { - (*counter_per_kernel)[kernel_index - 1] += 1; + counter_per_kernel[kernel_index - 1] += 1; ++rulebook_len; } else { rulebook_ptr[rulebook_index] = kernel_index - 1; diff --git a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 5a892e64c65a0..f15a636f96d45 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -69,8 +69,11 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook - std::vector counter_per_kernel(kernel_size, 0); - std::vector offsets(kernel_size + 1); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // DenseTensor* rulebook = nullptr; const IntT* rulebook_ptr = nullptr; @@ -83,8 +86,8 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, key, out_dims, out, - &counter_per_kernel, - &offsets, + h_counter_ptr, + h_offsets_ptr, &n, &need_product_rulebook); } @@ -99,24 +102,17 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, out_dims, subm, &tmp_rulebook, - &counter_per_kernel); + h_counter_ptr); UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, out_channels, out_dims, &tmp_rulebook, out); n = tmp_rulebook.dims()[1]; rulebook_ptr = tmp_rulebook.data(); - phi::funcs::sparse::SaveToTable(dev_ctx, - x, - key, - tmp_rulebook, - counter_per_kernel, - out, - rulebook, - counter); + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); } // int n = rulebook->dims()[1]; - const int* counter_ptr = counter_per_kernel.data(); // 2. 
gather DenseTensorMeta in_features_meta( @@ -140,24 +136,24 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, auto blas = phi::funcs::GetBlas(dev_ctx); int offset = 0; for (int i = 0; i < kernel_size; i++) { - offsets[i] = offset; - offset += counter_ptr[i]; + h_offsets_ptr[i] = offset; + offset += h_counter_ptr[i]; } - offsets[kernel_size] = offset; + h_offsets_ptr[kernel_size] = offset; const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter_ptr[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = counter_ptr[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; // in_channels const int N = out_channels; // out_channels - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, M, diff --git a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc index 36949f7161245..a8d41d0578b87 100644 --- a/paddle/phi/kernels/sparse/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/pool_kernel.cc @@ -48,10 +48,6 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = real_kernel_sizes[3]; - // DenseTensorMeta counter_meta( - // DataType::INT32, {kernel_size}, DataLayout::NCHW); - // DenseTensor counter_per_kernel = phi::Empty(dev_ctx, - // std::move(counter_meta)); std::vector counter_per_kernel(kernel_size, 0); const T* in_features_ptr = x.non_zero_elements().data(); @@ -65,7 +61,7 @@ void MaxPoolCPUKernel(const CPUContext& dev_ctx, out_dims, false, rulebook, - &counter_per_kernel); + counter_per_kernel.data()); UpdateRulebookAndOutIndex( dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); diff --git a/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp b/paddle/phi/kernels/sparse/gpu/.convolution_grad_kernel.cu.swp deleted file mode 100644 index e1d0e2bee6c88631e06944143561a3ec7c50b945..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeHNZEPGz86GGlgtQPSsgMw2aFDrk?#p)6L|4afu}yl4V+Y?QQLxh0-tOGp;=SF? 
z?yT)gNGl;xBt#I3gvzh-RfJR!@PpswLn@+D1!`1*A|WIY^EhE2nGRUt_mP+}noxDCZHJvY&N+$|JgcRFuhZV22b*HtaTkX&-dcON< zX7J*9-85}h^x|9FDy}i#VRldqJlzjivE_wD!*$kOJLHz@Xf0niwMeF5gnJcGy;ls} z>kNdvnLqg;nVy;~2`Y|`eV9J<-V66SE2{pA0mXn~Krx^gPz)#r6a$KZdx`<>-AA58 zjo%vA`f_}K%Z~f&Z$_6Hr+;e4`?tl<$Kw0*JKo2S>ZTY_3@8Q^1BwB~fMP%~pcqgL zC@-yHT@Hp@x;MGIW2VMbw3j754Ch!dK1>gzbD$oJufe!$G zegYi+jkK19PlJC4IBi1hYEfV_%84n;5u*#co_KG z+X?w0@Dwl)90gvxpO6=UF9Q~E4444^g-wo^fLnk8oB$30zq*f*UjokpHZTd?58TG4 z%2$D>flmTO;Gf|67l7dVDZzPk(NQixv{}P-*EE|sbbzjX7peE+fdITt4H4=UB(^x|N{o7@6_5X1JllnZH`Pl4R3M%k(UwNI6={ zJ_@S=H_PR9gmt+73<@_(-DWdjJjg`x8k#6KJw_+9IU1WzzY$h)M3`Y*yT*JsW3qM4 z;F~!*Zfcualj%J4S&&0MI=2kX&;veGpUspCrDirNK%eHO3>eu(r9MF;8Esv+Ll$UY zh@Ohm^0TJSb&-cO9|`7XSD2_t#W5?k?HW2~nazTO*c4G{joGZj98@}6Fs%-5QdJMR zOB=Qxz_uj689D}-Kg0NGq|VH_V0x41DDsM{E%iVnZR>V34ta?vJVn!l7U|TZgK3MU zM99o845W)wV;m&`zYb3qsn;Z;mBCmNffJLJ%4Y{-+urok7QYre$=brS8}cZ|Cu#0D zkG}N6wx*GV+Ug^*Qo7sp$y-jZOU2@;l(aPJEH25@LdWCXq=f{j=N230bMt8bRL6gQSTzPTwa=c?2>Hr6fft%SU)$iIGLEA;d!BC`^_p%(|<3Q6lHNH zs0%{c0#3!~PBVtdA$g1^kJZU5a&Y(LCwdZbr`=KzQU6I6Un~NzlnD{#N06++XcQ`oe!rE3PY#~RQOhyx&}4LMMcfFSg`mGCIad)zheb~w5mHMxm2w`wtS&ZFI84nDoge1 zLX9r3(7ENM`D(qoyo9Gu(8|(fdbYYWpQFrzeV?*T56kj^B0mcQ%wlGNn1WBM5prhs z7nfQAO$)dV)g3xksnKd}jGnI4s)je1@^oGq)mkp7h-trmV zVQp<+vFR%&%~;j(l+N_xmeuT*eK{2E^StD{ZNYTHgzUs|LH zS+vNASKlF=Y>g3>Q`^=9D>&FpvbiQU)b^oNyrk~i@i%ylIJ^Fp< zhmLhMWYGcT2qxnF&H{E|nBK``XJ>Jym^wfe2M0On|NjPOC0T=i<@G;;$!0+MvzX*H@I13yC?%-Ts zocr5A2_V25IM;t2_!Dp&co}#OxCI121^6iN0PqKt_ZxsH_cNmG>QW3S1{4E|0mXn~ zKr!(D!N4GAW}m*ufR(87Ok@zjzC05V0J0xvq|cMt$4Xy0GU-k`nj@3e=>0DZAw52@ z=WVz~riSBht$`5EwE|*ED6Juu6Y;KRWfTdbK4)fTVtFYf#aoEj5z-JBR}x#YEjj}s zAnArgz-t_3g;)VZipW8jL=+^!w~QnHc2PzZz@>-ajn? z&_U^V&p90`GLoO%6B6;W$F|WSsdasX0w84eg7jsp!W-^|QHpw|B8U($u;uS2d<_k} zlo2yB)FTSE9AF6bzbi96itfgW3~t;iY)0%!)e(kNE?1Z8(^ETQCx=B#Mw82b6oQOh No8tu -void CoalescedGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -172,21 +172,21 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { - CoalescedGPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceGPUKernel", ([&] { + CoalesceGPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(coalesced, +PD_REGISTER_KERNEL(coalesce, GPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 8cd55e50d98d0..859857ed7baac 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -545,8 +545,8 @@ int ProductRuleBook(const Context& dev_ctx, DenseTensor* out_index, DenseTensor* unique_value, SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { + int* h_counter, + int* h_offsets) { auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); @@ 
-633,16 +633,11 @@ int ProductRuleBook(const Context& dev_ctx, out->SetMember(out_indices, out_values, out_dims, false); - CallThrustScan(dev_ctx, - counter_ptr, - kernel_size, - offsets_ptr, - h_counter->data(), - h_offsets->data()); + CallThrustScan( + dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets); dev_ctx.Wait(); - int rulebook_len = - (*h_offsets)[kernel_size - 1] + (*h_counter)[kernel_size - 1]; + int rulebook_len = h_offsets[kernel_size - 1] + h_counter[kernel_size - 1]; DenseTensor out_rulebook = phi::Empty(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); @@ -691,12 +686,8 @@ int ProductRuleBook(const Context& dev_ctx, IntT rulebook_len = (last - rulebook_ptr) / 2; - CallThrustScan(dev_ctx, - counter_ptr, - kernel_size, - offsets_ptr, - h_counter->data(), - h_offsets->data()); + CallThrustScan( + dev_ctx, counter_ptr, kernel_size, offsets_ptr, h_counter, h_offsets); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); // 3. sorted or merge the out index diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 89159f4c55a70..543f3884edcb4 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -65,7 +65,11 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf @@ -86,8 +90,8 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, key, out_dims, out, - &h_counter, - &offsets, + h_counter.data(), + h_offsets.data(), &rulebook_len, &need_product_rulebook); } @@ -108,8 +112,8 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &h_counter, - &offsets); + h_counter_ptr, + h_offsets_ptr); rulebook_ptr = tmp_rulebook.data(); phi::funcs::sparse::SaveToTable( @@ -161,17 +165,17 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = h_counter[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; const int N = out_channels; - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 7ac727cae4ca9..a34a87eb1f645 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -90,8 +90,8 @@ void MaxPoolGPUKernel(const GPUContext& dev_ctx, &out_index, &unique_value, out, - &h_counter, - &offsets); + h_counter.data(), + offsets.data()); const IntT* rulebook_ptr = 
rulebook->data(); diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 2f5bb189c0ffe..12d55596a935d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,10 +153,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - SparseCooTensor before_coalesced( - indices, values, phi::make_ddim(dense_shape.GetData())); - // CoalescedKernel(dev_ctx, before_coalesced, out); - *out = before_coalesced; + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); } } // namespace sparse diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index df0e87c6b5a49..f7c7b7e9486ee 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/kernels/sparse/conv_kernel.h" @@ -212,7 +212,7 @@ void TestConv3dBase(const std::vector& indices, "Conv3d", &d_rulebook, &d_counter); - SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 7497dca51a59c..a06d85738586e 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/pool_kernel.h" @@ -160,7 +160,7 @@ void TestMaxPoolBase(const std::vector& indices, strides, &d_rulebook, &d_counter); - SparseCooTensor tmp_d_out = sparse::Coalesced(dev_ctx_gpu, d_out); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index ede33e4167472..36ecfeccd1a1d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -53,7 +53,7 @@ def test_conv3d(self): groups=1, data_format="NDHWC") out.backward(out) - out = paddle.incubate.sparse.coalesced(out) + out = paddle.incubate.sparse.coalesce(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index 6c9678873abe5..7e8fdfa7bfd35 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -18,6 +18,7 @@ from .unary import sqrt from .unary import sin from .unary import tanh +from .unary import coalesce from .binary import mv from .binary import matmul @@ -28,8 +29,6 @@ from .math import multiply from .math import subtract -from .coalesced import coalesced - from . import nn __all__ = [ @@ -45,5 +44,5 @@ 'subtract', 'multiply', 'divide', - 'coalesced', + 'coalesce', ] diff --git a/python/paddle/incubate/sparse/coalesced.py b/python/paddle/incubate/sparse/coalesced.py deleted file mode 100644 index 23c82499851d3..0000000000000 --- a/python/paddle/incubate/sparse/coalesced.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle import _C_ops -from paddle.fluid.framework import core, dygraph_only - -__all__ = [ - 'coalesced', -] - - -@dygraph_only -def coalesced(x): - r""" - the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . - - Args: - x (Tensor): the input SparseCooTensor. - - Returns: - Tensor: return the SparseCooTensor after coalesced. - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.incubate import sparse - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 1], [1, 1, 2]] - values = [1.0, 2.0, 3.0] - sp_x = sparse.sparse_coo_tensor(indices, values) - sp_x = sparse.coalesced(sp_x) - print(sp_x.indices()) - #[[0, 1], [1, 2]] - print(sp_x.values()) - #[3.0, 3.0] - """ - return _C_ops.final_state_sparse_coalesced(x) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index 09e449b0d9c5e..9220debdc24de 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -109,3 +109,35 @@ def sin(x, name=None): out = paddle.incubate.sparse.sin(sparse_x) """ return _C_ops.final_state_sparse_sin(x) + + +@dygraph_only +def coalesce(x): + r""" + the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique, . + + Args: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: return the SparseCooTensor after coalesced. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate import sparse + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesced(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ + return _C_ops.final_state_sparse_coalesce(x) From 123b16c977acdebc1482dc703fc7df1002cabd6a Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 8 Jul 2022 13:05:59 +0000 Subject: [PATCH 66/70] fix test_sparse_utils --- python/paddle/fluid/tests/unittests/test_sparse_utils_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index b71ef0357cb37..471ab432215ef 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -298,7 +298,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) - sparse_x = paddle.incubate.sparse.coalesced(sparse_x) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -311,7 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) - sparse_x = paddle.incubate.sparse.coalesced(sparse_x) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) From af66998715bd5d0a85d796f676125bfa74567f21 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 13 Jul 2022 06:28:15 +0000 Subject: [PATCH 67/70] sparse support amp --- paddle/fluid/eager/eager_amp_auto_cast.h | 7 ++++++- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 9 +++++++-- paddle/phi/kernels/sparse/empty_kernel.cc | 2 ++ paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu | 4 ++++ paddle/phi/kernels/sparse/gpu/unary_kernel.cu | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 438ccbaca8a5e..e13d0cbc7484d 100644 --- 
a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -85,7 +85,12 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { - return cast_final_state_dygraph_function(input, dst_dtype); + if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { + return sparse::cast_final_state_dygraph_function( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } else { + return cast_final_state_dygraph_function(input, dst_dtype); + } } return input; } diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 8f4af0a450890..e9f820a318482 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -503,5 +503,10 @@ void Pad3dGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pad3d_grad, + GPU, + ALL_LAYOUT, + phi::Pad3dGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index fe7fb72b4caa6..c1706b9919d90 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -97,6 +97,7 @@ PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -112,6 +113,7 @@ PD_REGISTER_KERNEL(empty_like_csr, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCsrKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu index c1f2b2a1f0d1d..be0f13fb0e538 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu @@ -23,6 +23,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -32,6 +33,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -56,6 +58,7 @@ PD_REGISTER_KERNEL(cast_coo_grad, GPU, ALL_LAYOUT, phi::sparse::CastCooGradKernel, + phi::dtype::float16, float, double, int8_t, @@ -69,6 +72,7 @@ PD_REGISTER_KERNEL(cast_csr_grad, GPU, ALL_LAYOUT, phi::sparse::CastCsrGradKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu index fdf0b5106d3cf..6358b7b983576 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu @@ -67,6 +67,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -76,6 +77,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -119,6 +121,7 @@ PD_REGISTER_KERNEL(cast_coo, GPU, ALL_LAYOUT, phi::sparse::CastCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -132,6 +135,7 @@ PD_REGISTER_KERNEL(cast_csr, GPU, ALL_LAYOUT, phi::sparse::CastCsrKernel, + phi::dtype::float16, float, 
double, int8_t, From 91ee01bea4416955ae964b2926f18fe9b712b991 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 18 Jul 2022 09:07:39 +0000 Subject: [PATCH 68/70] resolve conflict --- paddle/phi/kernels/sparse/conv_grad_kernel.h | 11 +- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 197 ++++++++++-------- .../kernels/test_sparse_conv3d_dev_api.cc | 2 - 3 files changed, 120 insertions(+), 90 deletions(-) diff --git a/paddle/phi/kernels/sparse/conv_grad_kernel.h b/paddle/phi/kernels/sparse/conv_grad_kernel.h index 205823e620375..867f6b5a53f37 100644 --- a/paddle/phi/kernels/sparse/conv_grad_kernel.h +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -25,13 +25,16 @@ template void Conv3dCooGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, + const SparseCooTensor& out, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* x_grad, DenseTensor* kernel_grad); @@ -40,13 +43,16 @@ std::tuple Conv3dCooGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, + const SparseCooTensor& out, const DenseTensor& rulebook, + const DenseTensor& counter, const SparseCooTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, const int groups, - const bool subm) { + const bool subm, + const std::string& key) { SparseCooTensor x_grad; DenseTensor kernel_grad; @@ -54,13 +60,16 @@ std::tuple Conv3dCooGrad( Conv3dCooGradKernel(dev_ctx, x, kernel, + out, rulebook, + counter, out_grad, paddings, dilations, strides, groups, subm, + key, &x_grad, &kernel_grad); return std::make_tuple(x_grad, kernel_grad); diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 6820b677147f3..543f3884edcb4 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -21,7 +21,9 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" -#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/gpu/conv.cu.h" + +#include "glog/logging.h" namespace phi { namespace sparse { @@ -35,8 +37,10 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -61,85 +65,117 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; - std::vector offsets(kernel_size + 1), h_counter(kernel_size); + DenseTensor h_counter, h_offsets; + h_counter.Resize({kernel_size}); + h_offsets.Resize({kernel_size + 1}); + int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); + int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. 
product rulebook - DenseTensorMeta counter_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensorMeta offsets_meta( - DataType::INT32, {kernel_size}, DataLayout::NCHW); - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); - DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); - DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); - DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); - DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - - int n = ProductRuleBook(dev_ctx, - x, - kernel_sizes, - subm_paddings, - dilations, - subm_strides, - out_dims, - subm, - rulebook, - &counter_per_kernel, - &offsets_per_kernel, - &out_index, - &unique_value, - out, - &h_counter, - &offsets); - - const int* counter_ptr = counter_per_kernel.data(); - const int* offsets_ptr = counter_per_kernel.data(); - const IntT* rulebook_ptr = rulebook->data(); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor out_index = phi::Empty(dev_ctx, {1}); + DenseTensor unique_value = phi::Empty(dev_ctx, {1}); + + VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key; + int rulebook_len = 0; + const IntT* rulebook_ptr = nullptr; + bool need_product_rulebook = true; + if (subm && !key.empty()) { + rulebook_ptr = phi::funcs::sparse::PrepareSubm( + dev_ctx, + x, + key, + out_dims, + out, + h_counter.data(), + h_offsets.data(), + &rulebook_len, + &need_product_rulebook); + } + + if (need_product_rulebook) { + DenseTensor tmp_rulebook; + rulebook_len = ProductRuleBook(dev_ctx, + x, + kernel_sizes, + subm_paddings, + dilations, + subm_strides, + out_dims, + subm, + &tmp_rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_value, + out, + h_counter_ptr, + h_offsets_ptr); + rulebook_ptr = tmp_rulebook.data(); + + phi::funcs::sparse::SaveToTable( + dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter); + } // 2. gather - DenseTensorMeta in_features_meta( - x.dtype(), {n, in_channels}, DataLayout::NCHW); - DenseTensorMeta out_features_meta( - x.dtype(), {n, out_channels}, DataLayout::NCHW); phi::DenseTensor in_features = - phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, in_channels}); phi::DenseTensor out_features = - phi::Empty(dev_ctx, std::move(out_features_meta)); + phi::Empty(dev_ctx, {rulebook_len, out_channels}); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &out_features, static_cast(0.0f)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + n, - in_features_ptr, - n, - in_channels); + Gather(dev_ctx, + x.non_zero_elements().data(), + rulebook_ptr, + rulebook_len, + in_channels, + in_features_ptr); // 3. 
call gemm for every werght auto blas = phi::funcs::GetBlas(dev_ctx); auto* out_values = out->mutable_non_zero_elements(); T* out_values_ptr = out_values->data(); + set_zero(dev_ctx, out_values, static_cast(0.0f)); + + if (subm) { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + unique_value.ResizeAndAllocate( + {static_cast(out->nnz() * kernel_size)}); + out_index.ResizeAndAllocate({static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + GroupIndexs<<>>(rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); + } const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (h_counter[i] <= 0) { + if (h_counter_ptr[i] <= 0) { continue; } // call gemm: (n, in_channels) * (in_channels, out_channels) - const int M = h_counter[i]; + const int M = h_counter_ptr[i]; const int K = in_channels; const int N = out_channels; - T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels; const T* tmp_kernel_ptr = kernel_ptr + i * K * N; - T* tmp_out_ptr = out_features_ptr + offsets[i] * out_channels; + T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels; blas.GEMM(CblasNoTrans, CblasNoTrans, @@ -154,40 +190,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, } // 4. scatter - if (subm) { - set_zero(dev_ctx, out_values, static_cast(0.0f)); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1); - phi::funcs::ScatterCUDAKernel - <<>>(out_features_ptr, - rulebook_ptr + 2 * n, - out_values_ptr, - n, - out_channels, - false); - } else { - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, out->nnz() * out_channels, 1); - phi::funcs::sparse::ScatterKernel - <<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); - } + phi::funcs::sparse::ScatterV2(dev_ctx, + out_features_ptr, + out_index.data(), + unique_value.data(), + out->nnz(), + kernel_size, + out_channels, + 1, + out_values_ptr); } + /** - * x: (N, D, H, W, C) - * kernel: (D, H, W, C, OC) - * out: (N, D, H, W, OC) + * x: the input SparseCooTensor, shape is (N, D, H, W, C) + * kernel: the weight data, shape is (D, H, W, C, OC) + * out: the output SparseCooTensor, shape is (N, D, H, W, OC) + * rulebook: return rulebook if key is not vailed else return nullptr + * counter: return counter if key is not vailed else return nullptr **/ template void Conv3dCooKernel(const Context& dev_ctx, @@ -198,8 +217,10 @@ void Conv3dCooKernel(const Context& dev_ctx, const std::vector& strides, const int groups, const bool subm, + const std::string& key, SparseCooTensor* out, - DenseTensor* rulebook) { + DenseTensor* rulebook, + DenseTensor* counter) { PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { Conv3dCooGPUKernel(dev_ctx, @@ -210,8 +231,10 @@ void Conv3dCooKernel(const Context& dev_ctx, strides, groups, subm, + key, out, - rulebook); + rulebook, + counter); })); } diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 27b0ef2667973..f7c7b7e9486ee 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -214,8 +214,6 @@ void 
TestConv3dBase(const std::vector& indices,
                                            &d_counter);
     SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out);
-    SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out);
-
     ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
     ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
     for (int i = 0; i < correct_out_dims.size(); i++) {

From a51402a82941977e11a009b488431e200dc02583 Mon Sep 17 00:00:00 2001
From: zkh2016
Date: Tue, 19 Jul 2022 08:29:31 +0000
Subject: [PATCH 69/70] fix codestyle

---
 paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
index fdaff98de4329..a8e88f351ccbc 100644
--- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu
@@ -155,7 +155,6 @@ void CoalesceGPUKernel(const GPUContext& dev_ctx,
                                      out_values.data());
   }
-
   // 6. convert index to coordinate
   Dim const_dims;
   for (int i = 0; i < x.dims().size(); i++) {

From ab996e00fa09007055409d2943339aa188dc58c1 Mon Sep 17 00:00:00 2001
From: zkh2016
Date: Mon, 25 Jul 2022 11:22:10 +0000
Subject: [PATCH 70/70] supplement the description of key

---
 python/paddle/incubate/sparse/nn/functional/conv.py | 4 ++++
 python/paddle/incubate/sparse/nn/layer/conv.py      | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py
index 503ad9a127b0f..60cbb94bea236 100644
--- a/python/paddle/incubate/sparse/nn/functional/conv.py
+++ b/python/paddle/incubate/sparse/nn/functional/conv.py
@@ -277,6 +277,10 @@ def subm_conv3d(x,
            will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
            The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of:
            `[batch_size, input_depth, input_height, input_width, input_channels]`.
+        key(str, optional): the key is used to save or reuse the same rulebook.
+            The definition and role of the rulebook are described in
+            https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf.
+            The default value is None.
         name(str|None): For detailed information, please refer to
            :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.
diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py
index c7fe1f7b4033e..f44358bbe9f3e 100644
--- a/python/paddle/incubate/sparse/nn/layer/conv.py
+++ b/python/paddle/incubate/sparse/nn/layer/conv.py
@@ -297,6 +297,10 @@ class SubmConv3D(_Conv3D):
            of the input channels, while the second half of the filters is only
            connected to the second half of the input channels. The default value is 1.
         padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``.
+        key(str, optional): the key is used to save or reuse the same rulebook.
+            The definition and role of the rulebook are described in
+            https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf.
+            The default value is None.
         weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
            of conv3d. If it is set to None or one attribute of ParamAttr, conv3d
            will create ParamAttr as param_attr. If it is set to None, the parameter
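
As an editorial illustration of the `key` argument documented above, a minimal sketch of two submanifold convolutions sharing one rulebook might look like the following (the input construction mirrors the existing sparse conv unit test, and the SubmConv3D argument order is assumed from its docstring rather than taken from this patch):

.. code-block:: python

    import paddle
    from paddle.incubate import sparse
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        # A [1, 1, 3, 4, 1] NDHWC sparse input with 4 non-zero sites.
        indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]]
        values = [[1.0], [2.0], [3.0], [4.0]]
        x = sparse.sparse_coo_tensor(indices, values, [1, 1, 3, 4, 1])

        # Submanifold convs keep the sparsity pattern unchanged, so layers
        # that pass the same key can reuse one cached rulebook/counter pair
        # instead of recomputing it for every layer.
        conv1 = sparse.nn.SubmConv3D(1, 8, 3, padding=1, key='subm_block1')
        conv2 = sparse.nn.SubmConv3D(8, 8, 3, padding=1, key='subm_block1')
        out = conv2(conv1(x))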