Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement block-wide odd-even sort #632

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions device/common/include/traccc/device/sort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstdint>
#include <vecmem/memory/device_atomic_ref.hpp>

#include "traccc/definitions/qualifiers.hpp"
#include "traccc/device/concepts/barrier.hpp"
#include "traccc/device/concepts/thread_id.hpp"

namespace traccc::device {
/**
* @brief Swap two values of arbitrary type.
*
* @tparam T The type of values to swap.
*
* @param a The first object in the swap (will take the value of b).
* @param b The second object in the swap (will take the value of a).
*/
template <std::movable T>
TRACCC_DEVICE void swap(T& a, T& b) {
T t = std::move(a);
a = std::move(b);
b = std::move(t);
}

/**
* @brief Perform a block-wide odd-even key sorting.
*
* This function performs a sorting operation across the entire block, assuming
* that all the threads in the block are currently active.
*
* @warning The behaviour of this function is ill-defined if any of the threads
* in the block have exited.
*
* @warning This method is efficient for sorting small arrays, preferably in
* shared memory, but given the O(n^2) worst-case performance this should not
* be used on larger arrays.
*
* @tparam T The thread identifier type.
* @tparam B The barrier type
* @tparam K The type of keys to sort.
* @tparam C The type of the comparison function.
*
* @param thread_id The thread identifier object.
* @param barrier The barrier to use for block synchronization.
* @param keys An array of keys to sort.
* @param num_keys The number of keys in the array to sort.
* @param comparison A comparison function.
*/
template <concepts::thread_id1 T, concepts::barrier B, std::movable K,
std::strict_weak_order<K, K> C>
TRACCC_DEVICE void blockOddEvenSort(T& thread_id, B& barrier, K* keys,
uint32_t num_keys, C&& comparison) {
bool sorted;

do {
sorted = true;

for (uint32_t j = 2 * thread_id.getLocalThreadIdX() + 1;
j < num_keys - 1; j += 2 * thread_id.getBlockDimX()) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}

barrier.blockBarrier();

for (uint32_t j = 2 * thread_id.getLocalThreadIdX(); j < num_keys - 1;
j += 2 * thread_id.getBlockDimX()) {
if (comparison(keys[j + 1], keys[j])) {
swap(keys[j + 1], keys[j]);
sorted = false;
}
}
} while (barrier.blockOr(!sorted));
}
} // namespace traccc::device
1 change: 1 addition & 0 deletions tests/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ traccc_add_test(
test_unique_lock.cu
test_sanity_contiguous_on.cu
test_sanity_ordered_on.cu
test_sort.cu

LINK_LIBRARIES
CUDA::cudart
Expand Down
46 changes: 46 additions & 0 deletions tests/cuda/test_sort.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#include <gtest/gtest.h>

#include <vecmem/memory/cuda/managed_memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>

#include "../../cuda/src/utils/barrier.hpp"
#include "traccc/cuda/utils/thread_id.hpp"
#include "traccc/device/sort.hpp"

__global__ void testBlockSortKernel(uint32_t *keys, uint32_t n_keys) {
traccc::cuda::thread_id1 thread_id;
traccc::cuda::barrier barrier;
traccc::device::blockOddEvenSort(thread_id, barrier, keys, n_keys,
std::less<uint32_t>());
}

TEST(CUDASort, BlockOddEvenSort) {
vecmem::cuda::managed_memory_resource mr;

uint32_t n = 2803;
vecmem::unique_alloc_ptr<uint32_t[]> arr =
vecmem::make_unique_alloc<uint32_t[]>(mr, n);

// As long as 13 and n_keys are coprime, this will generate a big,
// non-sorted array containing every element.
for (uint32_t i = 0; i < n; i++) {
arr[i] = (13 * 500 * i) % n;
}

testBlockSortKernel<<<1, 1024u>>>(arr.get(), n);

ASSERT_EQ(cudaPeekAtLastError(), cudaSuccess);
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);

for (uint32_t i = 0; i < n; ++i) {
ASSERT_EQ(arr[i], i);
}
}
1 change: 1 addition & 0 deletions tests/sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ traccc_add_test(
test_cca.sycl
test_sanity_contiguous_on.sycl
test_sanity_ordered_on.sycl
test_sort.sycl

LINK_LIBRARIES
GTest::gtest_main
Expand Down
51 changes: 51 additions & 0 deletions tests/sycl/test_sort.sycl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#include <gtest/gtest.h>

#include <CL/sycl.hpp>
#include <vecmem/memory/sycl/shared_memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>

#include "../../sycl/src/utils/barrier.hpp"
#include "traccc/device/sort.hpp"
#include "traccc/sycl/utils/thread_id.hpp"

TEST(SYCLSort, BlockOddEvenSort) {
vecmem::sycl::shared_memory_resource mr;
cl::sycl::queue queue;

uint32_t n = 2803;
vecmem::unique_alloc_ptr<uint32_t[]> arr =
vecmem::make_unique_alloc<uint32_t[]>(mr, n);

// As long as 13 and n_keys are coprime, this will generate a big,
// non-sorted array containing every element.
for (uint32_t i = 0; i < n; i++) {
arr[i] = (13 * 500 * i) % n;
}

cl::sycl::nd_range test_range(cl::sycl::range<1>(128),
cl::sycl::range<1>(128));

queue
.submit([&, keys = arr.get()](cl::sycl::handler &h) {
h.parallel_for<class BlockOddEvenSortKernel>(
test_range, [=](cl::sycl::nd_item<1> item) {
traccc::sycl::thread_id1 thread_id(item);
traccc::sycl::barrier barrier(item);
traccc::device::blockOddEvenSort(thread_id, barrier, keys,
n, std::less<uint32_t>());
});
})
.wait_and_throw();

for (uint32_t i = 0; i < n; ++i) {
ASSERT_EQ(arr[i], i);
}
}
Loading