Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement CUDA block- and warp-wide odd-even sort #347

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions device/cuda/src/utils/sort.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/** TRACCC library, part of the ACTS project (R&D line)
*
* (c) 2021 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc::cuda {
/**
* @brief Swap two values of arbitrary type.
*
* @tparam T The type of values to swap.
*
* @param a The first object in the swap (will take the value of b).
* @param b The second object in the swap (will take the value of a).
*/
template <typename T>
__device__ __forceinline__ void swap(T& a, T& b) {
T t = a;
a = b;
b = t;
}

/**
* @brief Perform a block-wide odd-even key sorting.
*
* This function performs a sorting operation across the entire block, assuming
* that all the threads in the block are currently active.
*
* @warning The behaviour of this function is ill-defined if any of the threads
* in the block have exited.
*
* @warning This method is efficient for sorting small arrays, preferable in
* shared memory, but given the O(n^2) worst-case performance this should not
* be used on larger arrays.
*
* @tparam K The type of keys to sort.
* @tparam C The type of the comparison function.
*
* @param keys An array of keys to sort.
* @param num_keys The number of keys in the array to sort.
* @param comparison A comparison function.
*/
template <typename K, typename C>
__device__ void blockOddEvenKeySort(K* keys, uint32_t num_keys,
C&& comparison) {
bool sorted;

do {
sorted = true;

for (uint32_t j = 2 * threadIdx.x + 1; j < num_keys - 1;
j += 2 * blockDim.x) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}

__syncthreads();

for (uint32_t j = 2 * threadIdx.x; j < num_keys - 1;
j += 2 * blockDim.x) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}
} while (__syncthreads_or(!sorted));
}

/**
* @brief Perform a warp-wide odd-even key sorting.
*
* This function performs a sorting operation across a single warp, assuming
* that all the threads in the warp are currently active.
*
* @warning The behaviour of this function is ill-defined if any of the threads
* in the warp have exited.
*
* @warning This method is efficient for sorting small arrays, preferable in
* shared memory, but given the O(n^2) worst-case performance this should not
* be used on larger arrays.
*
* @tparam K The type of keys to sort.
* @tparam C The type of the comparison function.
*
* @param keys An array of keys to sort.
* @param num_keys The number of keys in the array to sort.
* @param comparison A comparison function.
*/
template <typename K, typename C>
__device__ void warpOddEvenKeySort(K* keys, uint32_t num_keys, C&& comparison) {
bool sorted;

do {
sorted = true;

for (uint32_t j = 2 * (threadIdx.x % warpSize) + 1; j < num_keys - 1;
j += 2 * warpSize) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}

__syncwarp(0xFFFFFFFF);

for (uint32_t j = 2 * (threadIdx.x % warpSize); j < num_keys - 1;
j += 2 * warpSize) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}
} while (__any_sync(0xFFFFFFFF, !sorted));
}
} // namespace traccc::cuda
1 change: 1 addition & 0 deletions tests/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ traccc_add_test(
test_thrust.cu
test_sync.cu
test_array_wrapper.cu
test_sort.cu

LINK_LIBRARIES
CUDA::cudart
Expand Down
81 changes: 81 additions & 0 deletions tests/cuda/test_sort.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/**
* TRACCC library, part of the ACTS project (R&D line)
*
* (c) 2021 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#include <gtest/gtest.h>

#include <../../device/cuda/src/utils/sort.cuh>

__global__ void initializeArrayKernel(uint32_t *keys, uint32_t n_keys) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;

for (uint32_t i = tid; i < n_keys; i += blockDim.x) {
keys[i] = (13 * i) % n_keys;
}
}

__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) {
traccc::cuda::blockOddEvenKeySort(keys, n_keys, std::less<uint32_t>());
}

__global__ void testWarpSortKernel(uint32_t *keys, uint32_t n_keys) {
traccc::cuda::warpOddEvenKeySort(keys, n_keys, std::less<uint32_t>());
}

TEST(CUDASort, BlockOddEvenKeySort) {
uint32_t n = 2803;
uint32_t *dev_arr = nullptr;
std::unique_ptr<uint32_t[]> host_arr = std::make_unique<uint32_t[]>(n);

ASSERT_EQ(cudaMalloc(&dev_arr, n * sizeof(uint32_t)), cudaSuccess);
ASSERT_NE(dev_arr, nullptr);

initializeArrayKernel<<<1, 1024u>>>(dev_arr, n);
ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);

testBlockSortKernel<<<1, 1024u>>>(dev_arr, n);

ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);

ASSERT_EQ(cudaMemcpy(host_arr.get(), dev_arr, n * sizeof(uint32_t),
cudaMemcpyDeviceToHost),
cudaSuccess);

for (uint32_t i = 0; i < n; ++i) {
ASSERT_EQ(host_arr[i], i);
}

ASSERT_EQ(cudaFree(dev_arr), cudaSuccess);
}

TEST(CUDASort, WarpOddEvenKeySort) {
uint32_t n = 2803;
uint32_t *dev_arr = nullptr;
std::unique_ptr<uint32_t[]> host_arr = std::make_unique<uint32_t[]>(n);

ASSERT_EQ(cudaMalloc(&dev_arr, n * sizeof(uint32_t)), cudaSuccess);
ASSERT_NE(dev_arr, nullptr);

initializeArrayKernel<<<1, 1024u>>>(dev_arr, n);
ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);

testWarpSortKernel<<<1, 32u>>>(dev_arr, n);

ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);

ASSERT_EQ(cudaMemcpy(host_arr.get(), dev_arr, n * sizeof(uint32_t),
cudaMemcpyDeviceToHost),
cudaSuccess);

for (uint32_t i = 0; i < n; ++i) {
ASSERT_EQ(host_arr[i], i);
}

ASSERT_EQ(cudaFree(dev_arr), cudaSuccess);
}