speedup scan_nlist kernel #1028

Merged 2 commits on Aug 25, 2021
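In short: the old scan_nlist kernel assigned one thread per local atom, and that thread walked its whole padded row of length nall serially, with threads in a warp striding mem_size apart in memory. The new parallel_prefix_scan kernel assigns one thread block per row: consecutive threads read consecutive row entries (coalesced), and the compaction is done cooperatively with a block-wide exclusive prefix sum (cub::BlockScan on CUDA, hipcub::BlockScan on ROCm), carrying a running prefix across tiles via a callback functor. Below is a minimal, hypothetical sketch of that pattern in isolation (compact_row and RunningPrefix are illustrative names, not part of the PR):

#include <cub/block/block_scan.cuh>

// Stateful prefix functor: CUB invokes it from the first warp and uses
// thread 0's return value to seed each tile's scan with the running total.
struct RunningPrefix {
  int total;
  __device__ RunningPrefix(int t) : total(t) {}
  __device__ int operator()(int block_aggregate) {
    int old = total;
    total += block_aggregate;
    return old;
  }
};

// Compacts the valid (!= -1) entries of one padded row of length n:
// order[i] receives the compacted index of entry i, and *count the total
// number of valid entries. Launch with one block per row, e.g.
//   compact_row<128><<<1, 128>>>(d_row, d_order, d_count, n);
template <int TPB>
__global__ void compact_row(const int *row, int *order, int *count, int n) {
  typedef cub::BlockScan<int, TPB> BlockScan;
  __shared__ typename BlockScan::TempStorage temp;
  RunningPrefix prefix(0);
  // All threads enter every tile, so the collective scan and the barrier
  // are reached by the full block even when n % TPB != 0.
  for (int base = 0; base < n; base += TPB) {
    int ii = base + threadIdx.x;
    int flag = (ii < n && row[ii] != -1) ? 1 : 0;
    int pos;
    BlockScan(temp).ExclusiveSum(flag, pos, prefix);
    __syncthreads();
    if (flag) order[ii] = pos;
    if (ii == n - 1) *count = pos + flag;
  }
}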
97 changes: 66 additions & 31 deletions source/lib/src/cuda/neighbor_list.cu
@@ -2,6 +2,69 @@
#include "gpu_cuda.h"
#include "neighbor_list.h"

#include <cub/block/block_scan.cuh>
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
struct parallel_prefix_scan_op
{
  // Running prefix
  int running_total;
  // Constructor
  __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {}
  // Callback operator to be entered by the first warp of threads in the block.
  // Thread 0 is responsible for returning a value for seeding the block-wide scan.
  __device__ int operator()(int block_aggregate)
  {
    int old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};

template <int THREADS_PER_BLOCK>
__global__ void parallel_prefix_scan(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall)
{
  // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads, one int per thread
  typedef cub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
  // Allocate shared memory for the block-wide scan
  __shared__ typename BlockScan::TempStorage temp_storage;

  // Initialize the running total carried across tiles
  parallel_prefix_scan_op prefix_op(0);

  // One block compacts one atom row, sweeping it in tiles of
  // THREADS_PER_BLOCK elements. Every thread enters every iteration so the
  // collective scan and __syncthreads() below are reached by the whole
  // block even when nall is not a multiple of THREADS_PER_BLOCK.
  const int block_offset = blockIdx.x * mem_size;
  for (int base = 0; base < nall; base += THREADS_PER_BLOCK)
  {
    const int ii = base + threadIdx.x;
    // Load one element of the padded row; out-of-range lanes contribute 0
    int i_data = ii < nall ? temp_nlist[block_offset + ii] : -1;
    int o_data = i_data == -1 ? 0 : 1;

    // Collectively compute the block-wide exclusive prefix sum,
    // seeded with the running prefix from the previous tiles
    BlockScan(temp_storage).ExclusiveSum(
        o_data, o_data, prefix_op);

    __syncthreads();
    // The exclusive sum of a valid entry is its compacted neighbor index
    if (i_data != -1) {
      nei_order[block_offset + ii] = o_data;
    }
    // The last element's exclusive sum plus its own flag is the row's count
    if (ii == nall - 1) {
      numneigh[blockIdx.x] = o_data + (i_data == -1 ? 0 : 1);
    }
  }
}

template<typename FPTYPE>
__device__ inline FPTYPE dev_dot(
FPTYPE * arr1,
@@ -45,29 +108,6 @@ __global__ void build_nlist(
}
}

__global__ void scan_nlist(
int * numneigh,
int * nei_order,
const int * temp_nlist,
const int mem_size,
const int nloc,
const int nall)
{
const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
if(atom_idx<nloc){
const int * row_nlist = temp_nlist + atom_idx * mem_size;
int * row_order = nei_order + atom_idx * mem_size;
int nei_num=0;
for(int i=0;i<nall;i++){
if(row_nlist[i]!=-1){
row_order[i]=nei_num;
nei_num++;
}
}
numneigh[atom_idx]=nei_num;
}
}

__global__ void fill_nlist(
int ** firstneigh,
const int * temp_nlist,
@@ -143,14 +183,9 @@ int build_nlist_gpu(
mem_size);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock_ = (nloc+TPB-1)/TPB;
scan_nlist<<<nblock_, TPB>>>(
numneigh,
nei_order,
temp_nlist,
mem_size,
nloc,
nall);
parallel_prefix_scan<TPB> <<<nloc, TPB>>>(
numneigh, nei_order,
temp_nlist, mem_size, nloc, nall);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
fill_nlist<<<block_grid, thread_grid>>>(
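The ROCm diff below mirrors the CUDA change line for line: hipCUB exposes the same BlockScan prefix-callback interface, so only the include (hipcub/hipcub.hpp), the hipcub:: namespace, and the launch syntax differ, with hipLaunchKernelGGL plus HIP_KERNEL_NAME standing in for the <<<nloc, TPB>>> launch of the templated kernel.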
98 changes: 67 additions & 31 deletions source/lib/src/rocm/neighbor_list.hip.cu
@@ -2,6 +2,69 @@
#include "device.h"
#include "neighbor_list.h"

#include "hipcub/hipcub.hpp"
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
struct parallel_prefix_scan_op
{
  // Running prefix
  int running_total;
  // Constructor
  __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {}
  // Callback operator to be entered by the first warp of threads in the block.
  // Thread 0 is responsible for returning a value for seeding the block-wide scan.
  __device__ int operator()(int block_aggregate)
  {
    int old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};

template <int THREADS_PER_BLOCK>
__global__ void parallel_prefix_scan(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall)
{
  // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads, one int per thread
  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
  // Allocate shared memory for the block-wide scan
  __shared__ typename BlockScan::TempStorage temp_storage;

  // Initialize the running total carried across tiles
  parallel_prefix_scan_op prefix_op(0);

  // One block compacts one atom row, sweeping it in tiles of
  // THREADS_PER_BLOCK elements. Every thread enters every iteration so the
  // collective scan and __syncthreads() below are reached by the whole
  // block even when nall is not a multiple of THREADS_PER_BLOCK.
  const int block_offset = blockIdx.x * mem_size;
  for (int base = 0; base < nall; base += THREADS_PER_BLOCK)
  {
    const int ii = base + threadIdx.x;
    // Load one element of the padded row; out-of-range lanes contribute 0
    int i_data = ii < nall ? temp_nlist[block_offset + ii] : -1;
    int o_data = i_data == -1 ? 0 : 1;

    // Collectively compute the block-wide exclusive prefix sum,
    // seeded with the running prefix from the previous tiles
    BlockScan(temp_storage).ExclusiveSum(
        o_data, o_data, prefix_op);

    __syncthreads();
    // The exclusive sum of a valid entry is its compacted neighbor index
    if (i_data != -1) {
      nei_order[block_offset + ii] = o_data;
    }
    // The last element's exclusive sum plus its own flag is the row's count
    if (ii == nall - 1) {
      numneigh[blockIdx.x] = o_data + (i_data == -1 ? 0 : 1);
    }
  }
}

template<typename FPTYPE>
__device__ inline FPTYPE dev_dot(
FPTYPE * arr1,
@@ -45,29 +108,6 @@ __global__ void build_nlist(
}
}

__global__ void scan_nlist(
int * numneigh,
int * nei_order,
const int * temp_nlist,
const int mem_size,
const int nloc,
const int nall)
{
const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
if(atom_idx<nloc){
const int * row_nlist = temp_nlist + atom_idx * mem_size;
int * row_order = nei_order + atom_idx * mem_size;
int nei_num=0;
for(int i=0;i<nall;i++){
if(row_nlist[i]!=-1){
row_order[i]=nei_num;
nei_num++;
}
}
numneigh[atom_idx]=nei_num;
}
}

__global__ void fill_nlist(
int ** firstneigh,
const int * temp_nlist,
@@ -143,14 +183,10 @@ int build_nlist_gpu_rocm(
mem_size);
DPErrcheck(hipGetLastError());
DPErrcheck(hipDeviceSynchronize());
const int nblock_ = (nloc+TPB-1)/TPB;
hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0,
numneigh,
nei_order,
temp_nlist,
mem_size,
nloc,
nall);
hipLaunchKernelGGL(
HIP_KERNEL_NAME(parallel_prefix_scan<TPB>), nloc, TPB, 0, 0,
numneigh, nei_order,
temp_nlist, mem_size, nloc, nall);
DPErrcheck(hipGetLastError());
DPErrcheck(hipDeviceSynchronize());
hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0,
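For sanity checking, the semantics of the removed scan_nlist kernel fit in a few lines of host C++. A hypothetical test (not part of this PR; scan_row_reference is an illustrative name) could run this oracle per row and compare it against the numneigh and nei_order produced by parallel_prefix_scan:

#include <vector>

// CPU oracle for one row of the padded neighbor table (-1 = empty slot),
// matching what the removed scan_nlist kernel computed.
void scan_row_reference(const std::vector<int> &row,
                        std::vector<int> &order, int &count) {
  count = 0;
  // Slots with no neighbor are filled with -1 here; the removed kernel
  // simply left them untouched.
  order.assign(row.size(), -1);
  for (int i = 0; i < (int)row.size(); ++i) {
    if (row[i] != -1) {
      order[i] = count++;  // compacted index of the i-th valid entry
    }
  }
}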