Opt sparse mask_kernel #44302

Merged (8 commits) on Jul 13, 2022
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
#include "paddle/phi/kernels/sparse/mask_kernel.h"

#include "paddle/phi/api/ext/dispatch.h"
#include "paddle/phi/core/ddim.h"
@@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"

#include <thrust/binary_search.h>
#include "paddle/phi/kernels/sparse/mask_kernel.h"

#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
@@ -24,6 +22,7 @@ limitations under the License. */
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"

@@ -72,11 +71,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx,
phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data<int64_t>(),
&h_sparse_offsets[0],
sizeof(int64_t) * sparse_dim,
-#ifdef PADDLE_WITH_HIP
-hipMemcpyHostToDevice,
-#else
-cudaMemcpyHostToDevice,
-#endif
+gpuMemcpyHostToDevice,
dev_ctx.stream());

DenseTensor out_indices = phi::EmptyLike<T>(dev_ctx, indices);
@@ -93,14 +88,15 @@

auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1);
-MaskKernel<T, IntT><<<config.block_per_grid, config.thread_per_block>>>(
-x_ptr,
-indices_ptr,
-sparse_offsets.data<int64_t>(),
-non_zero_num,
-cols,
-sparse_dim,
-out_values_ptr);
+MaskKernel<T, IntT>
+<<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+x_ptr,
+indices_ptr,
+sparse_offsets.data<int64_t>(),
+non_zero_num,
+cols,
+sparse_dim,
+out_values_ptr);

out->SetMember(out_indices, out_values, dims, true);
}
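Note on the hunk above: the per-backend #ifdef PADDLE_WITH_HIP selection of the memcpy kind is collapsed into the single gpuMemcpyHostToDevice constant, and the MaskKernel launch now passes dev_ctx.stream() explicitly, so the kernel runs on the same stream as the preceding asynchronous host-to-device copy rather than on the default stream. A minimal sketch of how such a unified constant can be provided, assuming the usual alias pattern (the phi header that actually defines it is not shown in this diff):

// Assumed shape of a backend-neutral memcpy-kind alias; illustrative only,
// the real definition lives elsewhere in the phi GPU backend headers.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#else
#include <cuda_runtime.h>
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#endif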
@@ -121,19 +117,31 @@ void SparseMaskKernel(const Context& dev_ctx,
}));
}

-template <typename T, typename IntT>
-__global__ void SparseMaskCopyKernel(const IntT* x_indexs,
-const IntT* mask_indexs,
-const IntT* bound_out,
-const T* x_values,
-const int64_t n,
-const int64_t stride,
-T* out_values) {
+template <typename IntT>
+__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) {
+CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
+int index = x_indexs[i];
+table[index] = i == 0 ? -1 : i;
+}
+}
+
+template <typename T, typename IntT, int VecSize>
+__global__ void MaskCopy(const IntT* mask_indexs,
+const int* table,
+const int n,
+const int stride,
+const T* x_values,
+T* out_values) {
+using LoadT = phi::AlignedVector<T, VecSize>;
+using StoreT = phi::AlignedVector<T, VecSize>;
CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) {
-const IntT j = bound_out[i];
-if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) {
-for (int k = 0; k < stride; k++) {
-out_values[i * stride + k] = x_values[j * stride + k];
+int j = table[mask_indexs[i]];
+if (j != 0) {
+if (j == -1) j = 0;
+for (int k = 0; k < stride; k += VecSize) {
+LoadT vec_x;
+phi::Load<T, VecSize>(x_values + j * stride + k, &vec_x);
+phi::Store<T, VecSize>(vec_x, out_values + i * stride + k);
}
}
}
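Note on the kernels above: the old SparseMaskCopyKernel consumed bound_out, the result of a per-element binary search (thrust::lower_bound) over the flattened x indices. The replacement splits the work into MaskTable, which scatters each x row number into a dense table indexed by the flattened sparse index, and MaskCopy, which resolves every mask index with a single table read and then copies the row with vectorized loads and stores. The table is zero-filled in the launcher hunk further down, so a slot value of 0 must mean "no matching x entry"; row 0 is therefore stored as -1 and decoded back on lookup, and unmatched output rows keep the zeros written by set_zero. A host-side analogue of that encoding, as a minimal sketch (LookupRow and flat_index are illustrative names, not part of the patch):

// Returns the source row in x_values for a flattened mask index, or -1 when
// the mask entry has no counterpart in x. A zero-filled slot means "empty",
// so row 0 is stored as -1 by the table builder and decoded back here.
#include <cstdint>
#include <vector>

inline int LookupRow(const std::vector<int>& table, int64_t flat_index) {
  const int j = table[flat_index];
  if (j == 0) return -1;
  return j == -1 ? 0 : j;
}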
@@ -179,11 +187,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx,
phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data<IntT>(),
sparse_offsets.data(),
sizeof(IntT) * sparse_dim,
-#ifdef PADDLE_WITH_HIP
-hipMemcpyHostToDevice,
-#else
-cudaMemcpyHostToDevice,
-#endif
+gpuMemcpyHostToDevice,
dev_ctx.stream());

// 3. flatten x indices and mask indices
@@ -210,37 +214,54 @@
mask_indexs.numel(),
sparse_dim,
mask_indexs_ptr);
-// 4. call thrust::lower_bound
-#ifdef PADDLE_WITH_HIP
-thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()),
-#else
-thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-x_indexs_ptr,
-x_indexs_ptr + x_indexs.numel(),
-mask_indexs_ptr,
-mask_indexs_ptr + mask_indexs.numel(),
-bound_out_ptr);

-// 5. copy value to out
+int table_size = 1;
+auto x_dims = x.dims();
+for (int i = 0; i < x_dims.size() - 1; i++) {
+table_size *= x_dims[i];
+}
+DenseTensor table = phi::Empty<int>(dev_ctx, {table_size});
+phi::backends::gpu::GpuMemsetAsync(
+table.data<int>(), 0, table_size * sizeof(int), dev_ctx.stream());
+const int64_t stride =
+x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1];
*out = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, out, static_cast<T>(0));
T* out_ptr = out->data<T>();

-const int64_t stride =
-x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1];
-
-SparseMaskCopyKernel<<<config.block_per_grid,
-config.thread_per_block,
-0,
-dev_ctx.stream()>>>(x_indexs_ptr,
-mask_indexs_ptr,
-bound_out_ptr,
-x.non_zero_elements().data<T>(),
-mask_indexs.numel(),
-stride,
-out_ptr);
+config =
+phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1);
+MaskTable<<<config.block_per_grid,
+config.thread_per_block,
+0,
+dev_ctx.stream()>>>(
+x_indexs_ptr, x_indexs.numel(), table.data<int>());
+config =
+phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1);
+const int VecBytes = 16;
+const int VecSize = VecBytes / sizeof(T);
+if (stride % VecSize == 0) {
+MaskCopy<T, IntT, VecSize>
+<<<config.block_per_grid,
+config.thread_per_block,
+0,
+dev_ctx.stream()>>>(mask_indexs_ptr,
+table.data<int>(),
+mask_indexs.numel(),
+stride,
+x.non_zero_elements().data<T>(),
+out_ptr);
+} else {
+MaskCopy<T, IntT, 1><<<config.block_per_grid,
+config.thread_per_block,
+0,
+dev_ctx.stream()>>>(mask_indexs_ptr,
+table.data<int>(),
+mask_indexs.numel(),
+stride,
+x.non_zero_elements().data<T>(),
+out_ptr);
+}
}
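Note on the launcher above: values are copied in chunks of VecSize elements through phi::AlignedVector loads and stores. With VecBytes fixed at 16, VecSize is 16 / sizeof(T), i.e. 4 floats, 2 doubles, or 8 half-precision values per transaction, and the scalar instantiation MaskCopy<T, IntT, 1> is used whenever the row width (stride) is not a multiple of VecSize. A sketch of that policy, under the assumption that a 16-byte transaction is the target width (the helper names here are illustrative, not the phi API):

// One aligned load/store moves VecSize scalars at once.
#include <cstdint>

template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedChunk {
  T val[VecSize];
};

template <typename T>
int ChooseVecSize(int64_t stride) {
  constexpr int kVecBytes = 16;  // one 128-bit global-memory transaction
  constexpr int kVecSize = kVecBytes / static_cast<int>(sizeof(T));
  // Rows that are not a multiple of the vector width fall back to scalars.
  return (stride % kVecSize == 0) ? kVecSize : 1;
}

Because VecSize parameterizes the kernel template, the width has to be fixed at compile time, which is why the launcher keeps the explicit if/else over the two instantiations instead of computing the width at runtime as this sketch does.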

template <typename T, typename Context>
@@ -257,7 +278,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

-PD_REGISTER_KERNEL(sparse_mask,
+PD_REGISTER_KERNEL(mask,
GPU,
ALL_LAYOUT,
phi::sparse::SparseMaskKernel,
@@ -272,7 +293,7 @@ PD_REGISTER_KERNEL(sparse_mask,
kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

-PD_REGISTER_KERNEL(sparse_mask_helper,
+PD_REGISTER_KERNEL(mask_helper,
GPU,
ALL_LAYOUT,
phi::sparse::SparseMaskHelperKernel,
1 change: 0 additions & 1 deletion paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc
@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h"

#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"

namespace phi {
namespace sparse {
2 changes: 1 addition & 1 deletion paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h
@@ -16,7 +16,7 @@ limitations under the License. */

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h"
#include "paddle/phi/kernels/sparse/mask_kernel.h"

namespace phi {
namespace sparse {