From 0c259684ba01d9b44b9fed5da7c7601528f4df61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Thu, 11 Jul 2024 00:37:30 +0000 Subject: [PATCH 01/13] WIP --- examples/CMakeLists.txt | 1 + examples/static_multiset/host_bulk_example.cu | 77 +++++++ .../cuco/detail/open_addressing/kernels.cuh | 47 ++++ .../open_addressing/open_addressing_impl.cuh | 62 +++++ .../open_addressing_ref_impl.cuh | 211 ++++++++++++++++++ .../static_multiset/static_multiset.inl | 42 ++++ .../static_multiset/static_multiset_ref.inl | 68 ++++++ include/cuco/operator.hpp | 6 + include/cuco/static_multiset.cuh | 18 ++ 9 files changed, 532 insertions(+) create mode 100644 examples/static_multiset/host_bulk_example.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b5fafd152..417aff2ed 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/stat ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") +ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_multiset/host_bulk_example.cu b/examples/static_multiset/host_bulk_example.cu new file mode 100644 index 000000000..89974a088 --- /dev/null +++ b/examples/static_multiset/host_bulk_example.cu @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_multiset "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `retrieve` on a multiset of keys. + * + */ +int main(void) +{ + using key_type = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + key_type constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::static_multiset multiset{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + multiset.insert(keys.begin(), keys.end()); + // Insert the same set of keys again, so each distinct key should occur twice in the multiset + multiset.insert(keys.begin(), keys.end()); + + // Counts the occurrences of matching keys contained in the multiset. + auto const output_size = multiset.count(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector output_probes(output_size); + thrust::device_vector output_matches(output_size); + + auto const [output_probes_end, output_matches_end] = + multiset.retrieve(keys.begin(), keys.end(), output_probes.begin(), output_matches.begin()); + auto const actual_output_size = output_probes_end - output_probes.begin(); + + // The total number of outer matches should be `N + N / 2` + assert(not(output_size == retrieve_size == num_keys * 2)); + + return 0; +} \ No newline at end of file diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 2e44dec54..2e74f6c9e 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -356,6 +356,53 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find(InputIt first, } } +// TODO docs +template +CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe, + cuco::detail::index_type n, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter* atomic_counter, + Ref ref) +{ + auto constexpr tile_size = cuco::detail::warp_size(); // TODO include + + namespace cg = cooperative_groups; + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const tile_idx = cuco::detail::global_thread_id() / tile_size; + + auto const tiles_in_grid = (gridDim.x * BlockSize) / tile_size; + auto const elems_per_tile = cuco::detail::int_div_ceil(n, tiles_in_grid); // TODO include + + auto const tile_begin_offset = tile_idx * elems_per_tile; + auto const tile_end_offset = max(n, tile_begin_offset + elems_per_tile); + + if (tile_begin_offset < tile_end_offset) { + if constexpr (IsOuter) { + ref.retrieve_outer(tile, + input_probe + tile_begin_offset, + input_probe + tile_end_offset, + output_probe, + output_match, + *atomic_counter); + } else { + ref.retrieve(tile, + input_probe + tile_begin_offset, + input_probe + tile_end_offset, + output_probe, + output_match, + *atomic_counter); + } + } +} + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the container * diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 548ad09a6..766947f80 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -527,6 +527,34 @@ class open_addressing_impl { first, num_keys, output_begin, container_ref); } + // TODO docs + template + std::pair retrieve(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda_stream_ref stream) const + { // TODO cuda::stream_ref + auto constexpr is_outer = false; + return this->retrieve_impl( + first, last, output_probe, output_match, container_ref, stream); + } + + // TODO docs + template + std::pair retrieve_outer(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda_stream_ref stream) const + { // TODO cuda::stream_ref + auto constexpr is_outer = true; + return this->retrieve_impl( + first, last, output_probe, output_match, container_ref, stream); + } + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the container * @@ -862,6 +890,40 @@ class open_addressing_impl { return counter.load_to_host(stream); } + // TODO docs + template + std::pair retrieve_impl(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda_stream_ref stream) const + { // TODO cuda::stream_ref + auto const n = cuco::detail::distance(first, last); + if (n == 0) { return {output_probe, output_match}; } + + auto counter = + detail::counter_storage{this->allocator()}; + counter.reset(stream); + + auto constexpr block_size = cuco::detail::default_block_size(); + auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, + detail::retrieve); + + detail::retrieve<<>>( + first, n, output_probe, output_match, counter.data(), container_ref); + + auto const num_retrieved = counter.load_to_host(stream); + + return {output_probe + num_retrieved, output_match + num_retrieved}; + } + /** * @brief Extracts the key from a given slot. * diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 060c78737..437177245 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -962,6 +962,217 @@ class open_addressing_ref_impl { } } + // TODO docs + template + __device__ void retrieve( + cooperative_groups::thread_block_tile const& flushing_tile, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = false; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include + this->retrieve_impl( + flushing_tile, input_probe_begin, n, output_probe, output_match, atomic_counter); + } + + // TODO docs + template + __device__ void retrieve_outer( + cooperative_groups::thread_block_tile const& flushing_tile, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = true; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include + this->retrieve_impl( + flushing_tile, input_probe_begin, n, output_probe, output_match, atomic_counter); + } + + // TODO docs + template + __device__ void retrieve_impl( + cooperative_groups::thread_block_tile const& flushing_tile, + InputProbeIt input_probe, + cuco::detail::index_type n, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + namespace cg = cooperative_groups; + + if (n == 0) { return; } + + using probe_type = typename std::iterator_traits::value_type; + static_assert(FlushingTileSize >= cg_size); + + // tuning parameter + auto constexpr buffer_multiplicator = 1; + static_assert(buffer_multiplicator > 0); + + auto constexpr probing_tile_size = cg_size; + auto constexpr max_matches_per_step = FlushingTileSize * window_size; + auto constexpr buffer_size = buffer_multiplicator * max_matches_per_step; + + auto const probing_tile = cg::tiled_partition(flushing_tile); + + auto idx = flushing_tile.thread_rank() / probing_tile_size; + auto constexpr stride = FlushingTileSize / probing_tile_size; + + // TODO align to 16B? + __shared__ probe_type probe_buffer[buffer_size]; + __shared__ value_type match_buffer[buffer_size]; + + // using atomic_offset_type = cuda::atomic; + // __shared__ atomic_offset_type buffer_offsets[flushing_tiles_per_block]; + + // #if defined(CUCO_HAS_CG_INVOKE_ONE) + // cg::invoke_one(flushing_tile, + // [&]() { new (&buffer_offsets[flushing_tile_id]) atomic_offset_type{0}; }); + // #else + // if (flushing_tile.thread_rank() == 0) { + // new (&buffer_offsets[flushing_tile_id]) atomic_offset_type{0}; + // } + // #endif + // flushing_tile.sync(); // sync still needed since cg.any doesn't imply a memory barrier + + // iterate over input keys + // Note: `.any()` will implicitly synchronize the tile so we can safely reuse the shmem buffer + while (flushing_tile.any(idx < n)) { + bool active_flag = idx < n; + auto const active_flushing_tile = + cg::binary_partition(flushing_tile, active_flag); + + if (active_flag) { + // perform probing + // make sure the flushing_tile is converged at this point to get a coalesced load + auto const& probe = *(input_probe + idx); + auto probing_iter = + this->probing_scheme_(probing_tile, probe, this->storage_ref_.window_extent()); + bool empty_found = false; + bool match_found = false; + [[maybe_unused]] bool found_any_match = false; // only needed if `IsOuter == true` + size_type num_matches = 0; + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = this->storage_ref_[*probing_iter]; + + for (int32_t i = 0; i < window_size; ++i) { + // if we're not at the end of the probing sequence + if (probing_tile.any(empty_found)) { + // set empty flag for all threads in the probing tile + empty_found = true; + } else { + // inspect slot content + switch (this->predicate_.operator()( + probe, this->extract_key(window_slots[i]))) { + case detail::equal_result::EMPTY: { + empty_found = true; + break; + } + case detail::equal_result::EQUAL: { + match_found = true; + break; + } + default: { + break; + } + } + } + + if (active_flushing_tile.any(match_found)) { + auto const matching_tile = cg::binary_partition(active_flushing_tile, match_found); + // stage matches in shmem buffer + if (match_found) { + probe_buffer[num_matches + matching_tile.thread_rank()] = probe; + match_buffer[num_matches + matching_tile.thread_rank()] = window_slots[i]; + } + // add number of new matches to the buffer counter + num_matches += matching_tile.size(); + } + + if constexpr (IsOuter) { + if (not found_any_match /*yet*/ and probing_tile.any(match_found) /*now*/) { + found_any_match = true; + } + } + + // reset flag for next iteration + match_found = false; + } + + // check if all probing tiles have finished their work + bool const finished = active_flushing_tile.all(empty_found); + + if constexpr (IsOuter) { + if (finished and not found_any_match) { + // TODO write probe + empty_payload_sentinel or end() + // also increment num_matches accordingly + } + } + + // if the buffer has not enough empty slots for the next iteration or the tile has + // finished probing + if (finished or num_matches > (buffer_size - max_matches_per_step)) { + if (num_matches > 0) { + auto const rank = active_flushing_tile.thread_rank(); + +#if defined(CUCO_HAS_CG_INVOKE_ONE) + auto const offset = cg::invoke_one_broadcast(active_flushing_tile, [&]() { + return atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + }); +#else + size_type offset; + if (rank == 0) { + offset = atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + } + offset = active_flushing_tile.shfl(offset, 0); +#endif + + // flush_buffers + // TODO use memcpy_async or pragma unroll? + for (size_type i = rank; i < num_matches; i += active_flushing_tile.size()) { + *(output_probe + offset + i) = probe_buffer[i]; + *(output_match + offset + i) = match_buffer[i]; + } + + // reset buffer counter + num_matches = 0; + } + } + + // the entire flushing tile has finished its work + if (finished) { break; } + + // onto the next probing window + ++probing_iter; + } + } + + // onto the next key + idx += stride; + } + } + /** * @brief Executes a callback on every element in the container with key equivalent to the probe * key. diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index d24a090dd..5bb9d689e 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -274,6 +274,48 @@ void static_multisetfind_async(first, last, output_begin, ref(op::find), stream); } +// TODO docs +template +template +std::pair +static_multiset::retrieve( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda_stream_ref stream) const // TODO cuda::stream_ref +{ + return this->impl_->retrieve( + first, last, output_probe, output_match, this->ref(op::retrieve), stream); +} + +// TODO docs +template +template +std::pair +static_multiset::retrieve_outer( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda_stream_ref stream) const // TODO cuda::stream_ref +{ + return this->impl_->retrieve_outer( + first, last, output_probe, output_match, this->ref(op::retrieve), stream); +} + template +class operator_impl< + op::retrieve_tag, + static_multiset_ref> { + using base_type = static_multiset_ref; + using ref_type = + static_multiset_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + // TODO docs + template + __device__ void retrieve( + cooperative_groups::thread_block_tile const& flushing_tile, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto const& ref_ = static_cast(*this); + ref_.impl_.retrieve(flushing_tile, + input_probe_begin, + input_probe_end, + output_probe, + output_match, + atomic_counter); + } + + // TODO docs + template + __device__ void retrieve_outer( + cooperative_groups::thread_block_tile const& flushing_tile, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto const& ref_ = static_cast(*this); + ref_.impl_.retrieve_outer(flushing_tile, + input_probe_begin, + input_probe_end, + output_probe, + output_match, + atomic_counter); + } +}; + template + std::pair retrieve( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda_stream_ref stream = {}) const; // TODO cuda::stream_ref + + // TODO docs + template + std::pair retrieve_outer( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda_stream_ref stream = {}) const; // TODO cuda::stream_ref + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset * From ca1d62a7e0732c6b8b87b5a5726b7ca143e511b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Thu, 11 Jul 2024 00:42:00 +0000 Subject: [PATCH 02/13] Fix merge conflicts --- .../open_addressing/open_addressing_impl.cuh | 28 +++++++++---------- .../static_multiset/static_multiset.inl | 4 +-- include/cuco/static_multiset.cuh | 22 +++++++-------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 10529301a..35aae8139 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -579,8 +579,8 @@ class open_addressing_impl { OutputProbeIt output_probe, OutputMatchIt output_match, Ref container_ref, - cuda_stream_ref stream) const - { // TODO cuda::stream_ref + cuda::stream_ref stream) const + { auto constexpr is_outer = false; return this->retrieve_impl( first, last, output_probe, output_match, container_ref, stream); @@ -593,8 +593,8 @@ class open_addressing_impl { OutputProbeIt output_probe, OutputMatchIt output_match, Ref container_ref, - cuda_stream_ref stream) const - { // TODO cuda::stream_ref + cuda::stream_ref stream) const + { auto constexpr is_outer = true; return this->retrieve_impl( first, last, output_probe, output_match, container_ref, stream); @@ -942,8 +942,8 @@ class open_addressing_impl { OutputProbeIt output_probe, OutputMatchIt output_match, Ref container_ref, - cuda_stream_ref stream) const - { // TODO cuda::stream_ref + cuda::stream_ref stream) const + { auto const n = cuco::detail::distance(first, last); if (n == 0) { return {output_probe, output_match}; } @@ -954,14 +954,14 @@ class open_addressing_impl { auto constexpr block_size = cuco::detail::default_block_size(); auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, detail::retrieve); - - detail::retrieve<<>>( + block_size, + InputProbeIt, + OutputProbeIt, + OutputMatchIt, + decltype(counter), + Ref>); + + detail::retrieve<<>>( first, n, output_probe, output_match, counter.data(), container_ref); auto const num_retrieved = counter.load_to_host(stream); diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index aa5515d1d..8f95d6d59 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -292,7 +292,7 @@ static_multiset InputProbeIt last, OutputProbeIt output_probe, OutputMatchIt output_match, - cuda_stream_ref stream) const // TODO cuda::stream_ref + cuda::stream_ref stream) const { return this->impl_->retrieve( first, last, output_probe, output_match, this->ref(op::retrieve), stream); @@ -313,7 +313,7 @@ static_multiset InputProbeIt last, OutputProbeIt output_probe, OutputMatchIt output_match, - cuda_stream_ref stream) const // TODO cuda::stream_ref + cuda::stream_ref stream) const { return this->impl_->retrieve_outer( first, last, output_probe, output_match, this->ref(op::retrieve), stream); diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 44194874d..37787d1cd 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -475,21 +475,19 @@ class static_multiset { // TODO docs template - std::pair retrieve( - InputProbeIt first, - InputProbeIt last, - OutputProbeIt output_probe, - OutputMatchIt output_match, - cuda_stream_ref stream = {}) const; // TODO cuda::stream_ref + std::pair retrieve(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream = {}) const; // TODO docs template - std::pair retrieve_outer( - InputProbeIt first, - InputProbeIt last, - OutputProbeIt output_probe, - OutputMatchIt output_match, - cuda_stream_ref stream = {}) const; // TODO cuda::stream_ref + std::pair retrieve_outer(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream = {}) const; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset From c9a6179b466a029371a8a11c0aa155a6d98d4374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Thu, 11 Jul 2024 15:11:13 +0000 Subject: [PATCH 03/13] Switch to CTA granularity in device retrieve --- .../cuco/detail/open_addressing/kernels.cuh | 42 ++++----- .../open_addressing/open_addressing_impl.cuh | 17 ++-- .../open_addressing_ref_impl.cuh | 92 ++++++++++--------- .../static_multiset/static_multiset_ref.inl | 46 ++++------ include/cuco/detail/utils.hpp | 14 +-- 5 files changed, 101 insertions(+), 110 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 40060616f..983e1c3f6 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -374,34 +374,30 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe, AtomicCounter* atomic_counter, Ref ref) { - auto constexpr tile_size = cuco::detail::warp_size(); // TODO include - - namespace cg = cooperative_groups; - auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); - auto const tile_idx = cuco::detail::global_thread_id() / tile_size; + namespace cg = cooperative_groups; - auto const tiles_in_grid = (gridDim.x * BlockSize) / tile_size; - auto const elems_per_tile = cuco::detail::int_div_ceil(n, tiles_in_grid); // TODO include + auto const block = cg::this_thread_block(); + auto const blocks_in_grid = gridDim.x; + auto const elems_per_block = cuco::detail::int_div_ceil(n, blocks_in_grid); - auto const tile_begin_offset = tile_idx * elems_per_tile; - auto const tile_end_offset = max(n, tile_begin_offset + elems_per_tile); + auto const block_begin_offset = block.group_index().x * elems_per_block; + auto const block_end_offset = max(n, block_begin_offset + elems_per_block); - if (tile_begin_offset < tile_end_offset) { + if (block_begin_offset < block_end_offset) { if constexpr (IsOuter) { - ref.retrieve_outer(tile, - input_probe + tile_begin_offset, - input_probe + tile_end_offset, - output_probe, - output_match, - *atomic_counter); + ref.retrieve_outer(block, + input_probe + block_begin_offset, + input_probe + block_end_offset, + output_probe, + output_match, + *atomic_counter); } else { - ref.retrieve(tile, - input_probe + tile_begin_offset, - input_probe + tile_end_offset, - output_probe, - output_match, - *atomic_counter); + ref.retrieve(block, + input_probe + block_begin_offset, + input_probe + block_end_offset, + output_probe, + output_match, + *atomic_counter); } } } diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 35aae8139..a50ea4914 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -952,14 +952,15 @@ class open_addressing_impl { counter.reset(stream); auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, - detail::retrieve); + auto const grid_size = + cuco::detail::max_occupancy_grid_size(block_size, + detail::retrieve); detail::retrieve<<>>( first, n, output_probe, output_match, counter.data(), container_ref); diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e54c7d723..a93b84629 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -964,83 +965,85 @@ class open_addressing_ref_impl { } // TODO docs - template - __device__ void retrieve( - cooperative_groups::thread_block_tile const& flushing_tile, - InputProbeIt input_probe_begin, - InputProbeIt input_probe_end, - OutputProbeIt output_probe, - OutputMatchIt output_match, - AtomicCounter& atomic_counter) const + __device__ void retrieve(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const { auto constexpr is_outer = false; auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include - this->retrieve_impl( - flushing_tile, input_probe_begin, n, output_probe, output_match, atomic_counter); + this->retrieve_impl( + block, input_probe_begin, n, output_probe, output_match, atomic_counter); } // TODO docs - template - __device__ void retrieve_outer( - cooperative_groups::thread_block_tile const& flushing_tile, - InputProbeIt input_probe_begin, - InputProbeIt input_probe_end, - OutputProbeIt output_probe, - OutputMatchIt output_match, - AtomicCounter& atomic_counter) const + __device__ void retrieve_outer(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const { auto constexpr is_outer = true; auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include - this->retrieve_impl( - flushing_tile, input_probe_begin, n, output_probe, output_match, atomic_counter); + this->retrieve_impl( + block, input_probe_begin, n, output_probe, output_match, atomic_counter); } // TODO docs template - __device__ void retrieve_impl( - cooperative_groups::thread_block_tile const& flushing_tile, - InputProbeIt input_probe, - cuco::detail::index_type n, - OutputProbeIt output_probe, - OutputMatchIt output_match, - AtomicCounter& atomic_counter) const + __device__ void retrieve_impl(cooperative_groups::thread_block const& block, + InputProbeIt input_probe, + cuco::detail::index_type n, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const { namespace cg = cooperative_groups; if (n == 0) { return; } using probe_type = typename std::iterator_traits::value_type; - static_assert(FlushingTileSize >= cg_size); // tuning parameter - auto constexpr buffer_multiplicator = 1; - static_assert(buffer_multiplicator > 0); + auto constexpr buffer_multiplier = 1; + static_assert(buffer_multiplier > 0); - auto constexpr probing_tile_size = cg_size; - auto constexpr max_matches_per_step = FlushingTileSize * window_size; - auto constexpr buffer_size = buffer_multiplicator * max_matches_per_step; + auto constexpr probing_tile_size = cg_size; + auto constexpr flushing_tile_size = cuco::detail::warp_size(); + static_assert(flushing_tile_size >= probing_tile_size); - auto const probing_tile = cg::tiled_partition(flushing_tile); + auto constexpr num_flushing_tiles = BlockSize / flushing_tile_size; + auto constexpr max_matches_per_step = flushing_tile_size * window_size; + auto constexpr buffer_size = buffer_multiplier * max_matches_per_step; - auto idx = flushing_tile.thread_rank() / probing_tile_size; - auto constexpr stride = FlushingTileSize / probing_tile_size; + auto const flushing_tile = cg::tiled_partition(block); + auto const probing_tile = cg::tiled_partition(block); + + auto const flushing_tile_id = flushing_tile.meta_group_rank(); + auto idx = probing_tile.meta_group_rank(); + auto const stride = probing_tile.meta_group_size(); // TODO align to 16B? - __shared__ probe_type probe_buffer[buffer_size]; - __shared__ value_type match_buffer[buffer_size]; + __shared__ probe_type probe_buffers[num_flushing_tiles][buffer_size]; + __shared__ value_type match_buffers[num_flushing_tiles][buffer_size]; // using atomic_offset_type = cuda::atomic; // __shared__ atomic_offset_type buffer_offsets[flushing_tiles_per_block]; @@ -1060,7 +1063,7 @@ class open_addressing_ref_impl { while (flushing_tile.any(idx < n)) { bool active_flag = idx < n; auto const active_flushing_tile = - cg::binary_partition(flushing_tile, active_flag); + cg::binary_partition(flushing_tile, active_flag); if (active_flag) { // perform probing @@ -1104,8 +1107,9 @@ class open_addressing_ref_impl { auto const matching_tile = cg::binary_partition(active_flushing_tile, match_found); // stage matches in shmem buffer if (match_found) { - probe_buffer[num_matches + matching_tile.thread_rank()] = probe; - match_buffer[num_matches + matching_tile.thread_rank()] = window_slots[i]; + probe_buffers[flushing_tile_id][num_matches + matching_tile.thread_rank()] = probe; + match_buffers[flushing_tile_id][num_matches + matching_tile.thread_rank()] = + window_slots[i]; } // add number of new matches to the buffer counter num_matches += matching_tile.size(); @@ -1152,8 +1156,8 @@ class open_addressing_ref_impl { // flush_buffers // TODO use memcpy_async or pragma unroll? for (size_type i = rank; i < num_matches; i += active_flushing_tile.size()) { - *(output_probe + offset + i) = probe_buffer[i]; - *(output_match + offset + i) = match_buffer[i]; + *(output_probe + offset + i) = probe_buffers[flushing_tile_id][i]; + *(output_match + offset + i) = match_buffers[flushing_tile_id][i]; } // reset buffer counter diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl index efb0eec37..c07e9a074 100644 --- a/include/cuco/detail/static_multiset/static_multiset_ref.inl +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -471,49 +471,39 @@ class operator_impl< public: // TODO docs - template - __device__ void retrieve( - cooperative_groups::thread_block_tile const& flushing_tile, - InputProbeIt input_probe_begin, - InputProbeIt input_probe_end, - OutputProbeIt output_probe, - OutputMatchIt output_match, - AtomicCounter& atomic_counter) const + __device__ void retrieve(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const { auto const& ref_ = static_cast(*this); - ref_.impl_.retrieve(flushing_tile, - input_probe_begin, - input_probe_end, - output_probe, - output_match, - atomic_counter); + ref_.impl_.retrieve( + block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter); } // TODO docs - template - __device__ void retrieve_outer( - cooperative_groups::thread_block_tile const& flushing_tile, - InputProbeIt input_probe_begin, - InputProbeIt input_probe_end, - OutputProbeIt output_probe, - OutputMatchIt output_match, - AtomicCounter& atomic_counter) const + __device__ void retrieve_outer(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const { auto const& ref_ = static_cast(*this); - ref_.impl_.retrieve_outer(flushing_tile, - input_probe_begin, - input_probe_end, - output_probe, - output_match, - atomic_counter); + ref_.impl_.retrieve_outer( + block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter); } }; diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 86c045e3b..d0d777ed6 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,20 @@ #include #include -#include -#include +#include +#include namespace cuco { namespace detail { template -constexpr inline index_type distance(Iterator begin, Iterator end) +__host__ __device__ constexpr inline index_type distance(Iterator begin, Iterator end) { - using category = typename std::iterator_traits::iterator_category; - static_assert(std::is_base_of_v, + using category = typename cuda::std::iterator_traits::iterator_category; + static_assert(cuda::std::is_base_of_v, "Input iterator should be a random access iterator."); // `int64_t` instead of arch-dependant `long int` - return static_cast(std::distance(begin, end)); + return static_cast(cuda::std::distance(begin, end)); } /** From 52c43ee724f11048c4abd03b8ef2fffa304751bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Tue, 16 Jul 2024 23:05:31 +0000 Subject: [PATCH 04/13] Fix stupid bugs --- .../cuco/detail/open_addressing/kernels.cuh | 2 +- .../open_addressing/open_addressing_impl.cuh | 17 +++----- .../open_addressing_ref_impl.cuh | 41 ++++++++----------- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 983e1c3f6..fdacf9752 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -381,7 +381,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe, auto const elems_per_block = cuco::detail::int_div_ceil(n, blocks_in_grid); auto const block_begin_offset = block.group_index().x * elems_per_block; - auto const block_end_offset = max(n, block_begin_offset + elems_per_block); + auto const block_end_offset = min(n, block_begin_offset + elems_per_block); if (block_begin_offset < block_end_offset) { if constexpr (IsOuter) { diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index a50ea4914..71a6da929 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -949,23 +949,16 @@ class open_addressing_impl { auto counter = detail::counter_storage{this->allocator()}; - counter.reset(stream); + counter.reset(stream.get()); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::max_occupancy_grid_size(block_size, - detail::retrieve); + auto constexpr block_size = cuco::detail::default_block_size(); + auto constexpr grid_stride = 1; + auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size); detail::retrieve<<>>( first, n, output_probe, output_match, counter.data(), container_ref); - auto const num_retrieved = counter.load_to_host(stream); + auto const num_retrieved = counter.load_to_host(stream.get()); return {output_probe + num_retrieved, output_match + num_retrieved}; } diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index a93b84629..51dad4df4 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -1045,21 +1045,6 @@ class open_addressing_ref_impl { __shared__ probe_type probe_buffers[num_flushing_tiles][buffer_size]; __shared__ value_type match_buffers[num_flushing_tiles][buffer_size]; - // using atomic_offset_type = cuda::atomic; - // __shared__ atomic_offset_type buffer_offsets[flushing_tiles_per_block]; - - // #if defined(CUCO_HAS_CG_INVOKE_ONE) - // cg::invoke_one(flushing_tile, - // [&]() { new (&buffer_offsets[flushing_tile_id]) atomic_offset_type{0}; }); - // #else - // if (flushing_tile.thread_rank() == 0) { - // new (&buffer_offsets[flushing_tile_id]) atomic_offset_type{0}; - // } - // #endif - // flushing_tile.sync(); // sync still needed since cg.any doesn't imply a memory barrier - - // iterate over input keys - // Note: `.any()` will implicitly synchronize the tile so we can safely reuse the shmem buffer while (flushing_tile.any(idx < n)) { bool active_flag = idx < n; auto const active_flushing_tile = @@ -1081,11 +1066,7 @@ class open_addressing_ref_impl { auto const window_slots = this->storage_ref_[*probing_iter]; for (int32_t i = 0; i < window_size; ++i) { - // if we're not at the end of the probing sequence - if (probing_tile.any(empty_found)) { - // set empty flag for all threads in the probing tile - empty_found = true; - } else { + if (not empty_found) { // inspect slot content switch (this->predicate_.operator()( probe, this->extract_key(window_slots[i]))) { @@ -1111,8 +1092,10 @@ class open_addressing_ref_impl { match_buffers[flushing_tile_id][num_matches + matching_tile.thread_rank()] = window_slots[i]; } + // add number of new matches to the buffer counter - num_matches += matching_tile.size(); + num_matches += (match_found) ? matching_tile.size() + : active_flushing_tile.size() - matching_tile.size(); } if constexpr (IsOuter) { @@ -1124,14 +1107,25 @@ class open_addressing_ref_impl { // reset flag for next iteration match_found = false; } + empty_found = probing_tile.any(empty_found); // check if all probing tiles have finished their work bool const finished = active_flushing_tile.all(empty_found); if constexpr (IsOuter) { if (finished and not found_any_match) { - // TODO write probe + empty_payload_sentinel or end() - // also increment num_matches accordingly +#if defined(CUCO_HAS_CG_INVOKE_ONE) + cg::invoke_one(active_flushing_tile, [&]() { + probe_buffers[flushing_tile_id][num_matches] = probe; + probe_buffers[flushing_tile_id][num_matches] = this->empty_value_sentinel_; + }); +#else + if (active_flushing_tile.thread_rank() == 0) { + probe_buffers[flushing_tile_id][num_matches] = probe; + probe_buffers[flushing_tile_id][num_matches] = this->empty_value_sentinel_; + } +#endif + num_matches++; // not really a match but a sentinel in the buffer } } @@ -1154,7 +1148,6 @@ class open_addressing_ref_impl { #endif // flush_buffers - // TODO use memcpy_async or pragma unroll? for (size_type i = rank; i < num_matches; i += active_flushing_tile.size()) { *(output_probe + offset + i) = probe_buffers[flushing_tile_id][i]; *(output_match + offset + i) = match_buffers[flushing_tile_id][i]; From 8ea56fdc13a2625568f5e653ab5f7c0aebe376c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Tue, 16 Jul 2024 23:05:59 +0000 Subject: [PATCH 05/13] Update example --- examples/CMakeLists.txt | 2 +- examples/static_multiset/host_bulk_example.cu | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 417aff2ed..0b5bf183b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/examples/static_multiset/host_bulk_example.cu b/examples/static_multiset/host_bulk_example.cu index 89974a088..8ce96b99a 100644 --- a/examples/static_multiset/host_bulk_example.cu +++ b/examples/static_multiset/host_bulk_example.cu @@ -51,7 +51,8 @@ int main(void) cuco::static_multiset multiset{capacity, cuco::empty_key{empty_key_sentinel}}; // Create a sequence of keys {0, 1, 2, .., i} - thrust::device_vector keys(num_keys); + // We're going to insert each key twice so we only need 'num_keys / 2' distinct keys. + thrust::device_vector keys(num_keys / 2); thrust::sequence(keys.begin(), keys.end(), 0); // Inserts all keys into the hash set @@ -60,18 +61,22 @@ int main(void) multiset.insert(keys.begin(), keys.end()); // Counts the occurrences of matching keys contained in the multiset. - auto const output_size = multiset.count(keys.begin(), keys.end()); + std::size_t const counted_output_size = multiset.count(keys.begin(), keys.end()); // Storage for result - thrust::device_vector output_probes(output_size); - thrust::device_vector output_matches(output_size); + thrust::device_vector output_probes(counted_output_size); + thrust::device_vector output_matches(counted_output_size); - auto const [output_probes_end, output_matches_end] = + // Retrieve all matching keys + auto const [output_probes_end, _] = multiset.retrieve(keys.begin(), keys.end(), output_probes.begin(), output_matches.begin()); - auto const actual_output_size = output_probes_end - output_probes.begin(); + std::size_t const retrieved_output_size = output_probes_end - output_probes.begin(); - // The total number of outer matches should be `N + N / 2` - assert(not(output_size == retrieve_size == num_keys * 2)); + if ((retrieved_output_size == counted_output_size) and (retrieved_output_size == num_keys)) { + std::cout << "Success! Found all keys.\n"; + } else { + std::cout << "Fail! Something went wrong.\n"; + } return 0; } \ No newline at end of file From c8021616b56b21e66a8e070306fa0d8f2bf348f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Tue, 16 Jul 2024 23:08:16 +0000 Subject: [PATCH 06/13] Add benchmark --- benchmarks/CMakeLists.txt | 3 +- .../static_multiset/retrieve_bench.cu | 87 +++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 benchmarks/hash_table/static_multiset/retrieve_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4651d5646..b4d20083f 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -69,6 +69,7 @@ ConfigureBench(STATIC_MAP_BENCH # - static_multiset benchmarks -------------------------------------------------------------------- ConfigureBench(STATIC_MULTISET_BENCH hash_table/static_multiset/contains_bench.cu + hash_table/static_multiset/retrieve_bench.cu hash_table/static_multiset/count_bench.cu hash_table/static_multiset/find_bench.cu hash_table/static_multiset/insert_bench.cu) diff --git a/benchmarks/hash_table/static_multiset/retrieve_bench.cu b/benchmarks/hash_table/static_multiset/retrieve_bench.cu new file mode 100644 index 000000000..efd694946 --- /dev/null +++ b/benchmarks/hash_table/static_multiset/retrieve_bench.cu @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_multiset::retrieve` performance + */ +template +void static_multiset_retrieve(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + cuco::static_multiset set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + auto const output_size = set.count(keys.begin(), keys.end()); + thrust::device_vector output_match(output_size); + auto output_probe_begin = thrust::discard_iterator{}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.retrieve( + keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); From 9720c49cf4c09b999d1b30234d80947358df0e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Wed, 17 Jul 2024 15:47:30 +0000 Subject: [PATCH 07/13] Add docs --- .../open_addressing/open_addressing_impl.cuh | 97 ++++++++++++++++++- .../open_addressing_ref_impl.cuh | 97 ++++++++++++++++++- .../static_multiset/static_multiset_ref.inl | 63 +++++++++++- include/cuco/static_multiset.cuh | 59 ++++++++++- 4 files changed, 306 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 71a6da929..d9e31f1e5 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -572,7 +572,35 @@ class open_addressing_impl { first, num_keys, output_begin, container_ref); } - // TODO docs + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ template std::pair retrieve(InputProbeIt first, InputProbeIt last, @@ -586,7 +614,38 @@ class open_addressing_impl { first, last, output_probe, output_match, container_ref, stream); } - // TODO docs + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count_outer()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ template std::pair retrieve_outer(InputProbeIt first, InputProbeIt last, @@ -935,7 +994,39 @@ class open_addressing_impl { return counter.load_to_host(stream); } - // TODO docs + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()/count_outer()` to determine the size of the output range. + * + * If `IsOuter == true` and a key `k` has no matches in the container, then `{key, + * empty_slot_sentinel}` will be added to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam IsOuter Flag indicating if an inner or outer retrieve operation should be performed + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ template std::pair retrieve_impl(InputProbeIt first, InputProbeIt last, diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 0cf59e91c..35646760b 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -966,7 +966,35 @@ class open_addressing_ref_impl { } } - // TODO docs + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Atomic counter type that follows the same semantics as + * `cuda::atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Counter that is used to determine the next free position in the output + * sequences + */ template std::pair retrieve(InputProbeIt first, InputProbeIt last, @@ -481,7 +507,36 @@ class static_multiset { OutputMatchIt output_match, cuda::stream_ref stream = {}) const; - // TODO docs + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count_outer()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ template std::pair retrieve_outer(InputProbeIt first, InputProbeIt last, From a69a941fa0ce67e6e5f3a6c22b8c1442a37ba7e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Wed, 17 Jul 2024 22:06:52 +0000 Subject: [PATCH 08/13] Add unit test --- .../open_addressing_ref_impl.cuh | 4 +- tests/CMakeLists.txt | 3 +- tests/static_multiset/retrieve_test.cu | 189 ++++++++++++++++++ 3 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 tests/static_multiset/retrieve_test.cu diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 35646760b..a8fe25dee 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -1210,12 +1210,12 @@ class open_addressing_ref_impl { #if defined(CUCO_HAS_CG_INVOKE_ONE) cg::invoke_one(active_flushing_tile, [&]() { probe_buffers[flushing_tile_id][num_matches] = probe; - probe_buffers[flushing_tile_id][num_matches] = this->empty_value_sentinel_; + probe_buffers[flushing_tile_id][num_matches] = this->empty_slot_sentinel(); }); #else if (active_flushing_tile.thread_rank() == 0) { probe_buffers[flushing_tile_id][num_matches] = probe; - probe_buffers[flushing_tile_id][num_matches] = this->empty_value_sentinel_; + probe_buffers[flushing_tile_id][num_matches] = this->empty_slots_sentinel(); } #endif num_matches++; // not really a match but a sentinel in the buffer diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 491ecf841..a42f34a5d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -102,7 +102,8 @@ ConfigureTest(STATIC_MULTISET_TEST static_multiset/custom_count_test.cu static_multiset/find_test.cu static_multiset/insert_test.cu - static_multiset/for_each_test.cu) + static_multiset/for_each_test.cu + static_multiset/retrieve_test.cu) ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu new file mode 100644 index 000000000..2f2fb8d51 --- /dev/null +++ b/tests/static_multiset/retrieve_test.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static constexpr auto empty_key_sentinel = -1; + +template +void test_unique_sequence(Container& container, std::size_t num_keys) +{ + using key_type = typename Container::key_type; + + thrust::device_vector probed_keys(num_keys); + thrust::device_vector matched_keys(num_keys); + + auto input_keys_begin = thrust::counting_iterator{0}; + auto const input_keys_end = input_keys_begin + num_keys; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(container.size() == 0); + + auto const [probe_end, matched_end] = container.retrieve( + input_keys_begin, input_keys_end, probed_keys.begin(), matched_keys.begin()); + REQUIRE(std::distance(probed_keys.begin(), probe_end) == 0); + REQUIRE(std::distance(matched_keys.begin(), matched_end) == 0); + } + + container.insert(input_keys_begin, input_keys_end); + + SECTION("All inserted keys should be contained.") + { + auto const [probed_end, matched_end] = container.retrieve( + input_keys_begin, input_keys_end, probed_keys.begin(), matched_keys.begin()); + thrust::sort(probed_keys.begin(), probed_end); + thrust::sort(matched_keys.begin(), matched_end); + REQUIRE(cuco::test::equal( + probed_keys.begin(), probed_keys.end(), input_keys_begin, thrust::equal_to{})); + REQUIRE(cuco::test::equal( + matched_keys.begin(), matched_keys.end(), input_keys_begin, thrust::equal_to{})); + } +} + +template +void test_multiplicity(Container& container, std::size_t num_keys, std::size_t multiplicity) +{ + using key_type = typename Container::key_type; + + auto const num_unique_keys = num_keys / multiplicity; + REQUIRE(num_unique_keys > 0); + auto const num_actual_keys = num_unique_keys * multiplicity; + + thrust::device_vector input_keys(num_actual_keys); + thrust::device_vector probed_keys(num_actual_keys); + thrust::device_vector matched_keys(num_actual_keys); + + thrust::transform(thrust::counting_iterator(0), + thrust::counting_iterator(num_actual_keys), + input_keys.begin(), + cuda::proclaim_return_type([multiplicity] __device__(auto const& i) { + return static_cast(i / multiplicity); + })); + thrust::shuffle(input_keys.begin(), input_keys.end(), thrust::default_random_engine{}); + + container.insert(input_keys.begin(), input_keys.end()); + + SECTION("All inserted keys should be contained.") + { + auto const [probed_end, matched_end] = container.retrieve( + input_keys.begin(), input_keys.end(), probed_keys.begin(), matched_keys.begin()); + thrust::sort(input_keys.begin(), input_keys.end()); + thrust::sort(probed_keys.begin(), probed_end); + thrust::sort(matched_keys.begin(), matched_end); + REQUIRE(cuco::test::equal( + probed_keys.begin(), probed_keys.end(), input_keys.begin(), thrust::equal_to{})); + REQUIRE(cuco::test::equal( + matched_keys.begin(), matched_keys.end(), input_keys.begin(), thrust::equal_to{})); + } +} + +template +void test_outer(Container& container, std::size_t num_keys) +{ + using key_type = typename Container::key_type; + + thrust::device_vector probed_keys(num_keys * 2ull); + thrust::device_vector matched_keys(num_keys * 2ull); + + auto input_keys_begin = thrust::counting_iterator{0}; + auto const input_keys_end = input_keys_begin + num_keys; + + auto query_keys_begin = input_keys_begin; + auto const query_keys_end = query_keys_begin + num_keys * 2ull; + + SECTION("Non-inserted keys should output sentinels.") + { + REQUIRE(container.size() == 0); + + auto const [probed_end, matched_end] = container.retrieve_outer( + query_keys_begin, query_keys_end, probed_keys.begin(), matched_keys.begin()); + REQUIRE(static_cast(std::distance(probed_keys.begin(), probed_end)) == num_keys); + REQUIRE(static_cast(std::distance(matched_keys.begin(), matched_end)) == num_keys); + REQUIRE(cuco::test::all_of(matched_keys.begin(), + matched_keys.end(), + cuda::proclaim_return_type([] __device__(auto const& k) { + return static_cast( + k == static_cast(empty_key_sentinel)); + }))); + } + + container.insert(input_keys_begin, input_keys_end); + + SECTION("All inserted keys should be contained.") + { + auto const [probed_end, matched_end] = container.retrieve( + query_keys_begin, query_keys_end, probed_keys.begin(), matched_keys.begin()); + thrust::sort(probed_keys.begin(), probed_end); + thrust::sort(matched_keys.begin(), matched_end); + REQUIRE(cuco::test::equal( + probed_keys.begin(), probed_keys.end(), query_keys_begin, thrust::equal_to{})); + REQUIRE(cuco::test::equal(matched_keys.begin(), + matched_keys.begin() + num_keys, + input_keys_begin, + thrust::equal_to{})); + REQUIRE(cuco::test::all_of(matched_keys.begin() + num_keys, + matched_keys.end(), + cuda::proclaim_return_type([] __device__(auto const& k) { + return static_cast( + k == static_cast(empty_key_sentinel)); + }))); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multiset retrieve tests", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + constexpr double desired_load_factor = 1.; + + using probe = std::conditional_t>, + cuco::double_hashing>>; + + auto set = cuco::static_multiset{ + num_keys, desired_load_factor, cuco::empty_key{empty_key_sentinel}, {}, probe{}}; + + test_unique_sequence(set, num_keys); + // test_multiplicity(set, num_keys, decltype(set)::cg_size + 1); // TODO deadlock or infinite loop + // :/ test_outer(set, num_keys); // TODO also deadlocks -.- +} \ No newline at end of file From 1d2e33cae5122278d149243d6f35a853561aaf58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Fri, 19 Jul 2024 00:48:59 +0000 Subject: [PATCH 09/13] Fix input range assignment --- .../cuco/detail/open_addressing/kernels.cuh | 11 +++++---- .../open_addressing/open_addressing_impl.cuh | 23 +++++++++++++------ tests/static_multiset/retrieve_test.cu | 2 +- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index fdacf9752..acf959e3b 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -376,12 +376,13 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe, { namespace cg = cooperative_groups; - auto const block = cg::this_thread_block(); - auto const blocks_in_grid = gridDim.x; - auto const elems_per_block = cuco::detail::int_div_ceil(n, blocks_in_grid); + auto const block = cg::this_thread_block(); + auto constexpr tiles_in_block = BlockSize / Ref::cg_size; + // make sure all but the last block are always occupied + auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block; - auto const block_begin_offset = block.group_index().x * elems_per_block; - auto const block_end_offset = min(n, block_begin_offset + elems_per_block); + auto const block_begin_offset = block.group_index().x * items_per_block; + auto const block_end_offset = min(n, block_begin_offset + items_per_block); if (block_begin_offset < block_end_offset) { if constexpr (IsOuter) { diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index d9e31f1e5..4777244a1 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -1035,16 +1035,25 @@ class open_addressing_impl { Ref container_ref, cuda::stream_ref stream) const { - auto const n = cuco::detail::distance(first, last); + auto const n = detail::distance(first, last); if (n == 0) { return {output_probe, output_match}; } - auto counter = - detail::counter_storage{this->allocator()}; + using counter_type = detail::counter_storage; + auto counter = counter_type{this->allocator()}; counter.reset(stream.get()); - auto constexpr block_size = cuco::detail::default_block_size(); - auto constexpr grid_stride = 1; - auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size); + int32_t constexpr block_size = cuco::detail::default_block_size(); + int32_t grid_size = + detail::max_occupancy_grid_size(block_size, + detail::retrieve); + grid_size *= 1.2; // oversubscription factor + // TODO shrink grid if n is very small detail::retrieve<<>>( first, n, output_probe, output_match, counter.data(), container_ref); @@ -1080,4 +1089,4 @@ class open_addressing_impl { }; } // namespace detail -} // namespace cuco +} // namespace cuco \ No newline at end of file diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index 2f2fb8d51..a3cc8082e 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -185,5 +185,5 @@ TEMPLATE_TEST_CASE_SIG( test_unique_sequence(set, num_keys); // test_multiplicity(set, num_keys, decltype(set)::cg_size + 1); // TODO deadlock or infinite loop - // :/ test_outer(set, num_keys); // TODO also deadlocks -.- + // test_outer(set, num_keys); // TODO also deadlocks -.- } \ No newline at end of file From 6d323301761fcc09ae92e8b199d59be6be1617d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Fri, 19 Jul 2024 02:23:38 +0000 Subject: [PATCH 10/13] Fix multiplicity test --- tests/static_multiset/retrieve_test.cu | 53 ++++++-------------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index a3cc8082e..117b861fe 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -34,50 +34,17 @@ static constexpr auto empty_key_sentinel = -1; -template -void test_unique_sequence(Container& container, std::size_t num_keys) -{ - using key_type = typename Container::key_type; - - thrust::device_vector probed_keys(num_keys); - thrust::device_vector matched_keys(num_keys); - - auto input_keys_begin = thrust::counting_iterator{0}; - auto const input_keys_end = input_keys_begin + num_keys; - - SECTION("Non-inserted keys should not be contained.") - { - REQUIRE(container.size() == 0); - - auto const [probe_end, matched_end] = container.retrieve( - input_keys_begin, input_keys_end, probed_keys.begin(), matched_keys.begin()); - REQUIRE(std::distance(probed_keys.begin(), probe_end) == 0); - REQUIRE(std::distance(matched_keys.begin(), matched_end) == 0); - } - - container.insert(input_keys_begin, input_keys_end); - - SECTION("All inserted keys should be contained.") - { - auto const [probed_end, matched_end] = container.retrieve( - input_keys_begin, input_keys_end, probed_keys.begin(), matched_keys.begin()); - thrust::sort(probed_keys.begin(), probed_end); - thrust::sort(matched_keys.begin(), matched_end); - REQUIRE(cuco::test::equal( - probed_keys.begin(), probed_keys.end(), input_keys_begin, thrust::equal_to{})); - REQUIRE(cuco::test::equal( - matched_keys.begin(), matched_keys.end(), input_keys_begin, thrust::equal_to{})); - } -} - template void test_multiplicity(Container& container, std::size_t num_keys, std::size_t multiplicity) { using key_type = typename Container::key_type; + container.clear(); + auto const num_unique_keys = num_keys / multiplicity; REQUIRE(num_unique_keys > 0); auto const num_actual_keys = num_unique_keys * multiplicity; + REQUIRE(num_actual_keys <= num_keys); thrust::device_vector input_keys(num_actual_keys); thrust::device_vector probed_keys(num_actual_keys); @@ -92,6 +59,7 @@ void test_multiplicity(Container& container, std::size_t num_keys, std::size_t m thrust::shuffle(input_keys.begin(), input_keys.end(), thrust::default_random_engine{}); container.insert(input_keys.begin(), input_keys.end()); + REQUIRE(container.size() == num_actual_keys); SECTION("All inserted keys should be contained.") { @@ -112,6 +80,8 @@ void test_outer(Container& container, std::size_t num_keys) { using key_type = typename Container::key_type; + container.clear(); + thrust::device_vector probed_keys(num_keys * 2ull); thrust::device_vector matched_keys(num_keys * 2ull); @@ -123,8 +93,6 @@ void test_outer(Container& container, std::size_t num_keys) SECTION("Non-inserted keys should output sentinels.") { - REQUIRE(container.size() == 0); - auto const [probed_end, matched_end] = container.retrieve_outer( query_keys_begin, query_keys_end, probed_keys.begin(), matched_keys.begin()); REQUIRE(static_cast(std::distance(probed_keys.begin(), probed_end)) == num_keys); @@ -174,7 +142,7 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, cuco::test::probe_sequence::linear_probing, 2)) { constexpr std::size_t num_keys{400}; - constexpr double desired_load_factor = 1.; + constexpr double desired_load_factor = 0.5; using probe = std::conditional_t>, @@ -183,7 +151,8 @@ TEMPLATE_TEST_CASE_SIG( auto set = cuco::static_multiset{ num_keys, desired_load_factor, cuco::empty_key{empty_key_sentinel}, {}, probe{}}; - test_unique_sequence(set, num_keys); - // test_multiplicity(set, num_keys, decltype(set)::cg_size + 1); // TODO deadlock or infinite loop - // test_outer(set, num_keys); // TODO also deadlocks -.- + test_multiplicity(set, num_keys, 1); // unique sequence + test_multiplicity(set, num_keys, 2); // each key occurs twice + test_multiplicity(set, num_keys, 11); + // test_outer(set, num_keys); // TODO still fails } \ No newline at end of file From 50576f43d7b98081a527714aaa42524f2be36a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Fri, 19 Jul 2024 02:29:18 +0000 Subject: [PATCH 11/13] Trying some optimizations --- .../open_addressing/open_addressing_impl.cuh | 25 ++++---- .../open_addressing_ref_impl.cuh | 60 ++++++++++--------- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 4777244a1..37e90a7e0 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -1043,17 +1043,20 @@ class open_addressing_impl { counter.reset(stream.get()); int32_t constexpr block_size = cuco::detail::default_block_size(); - int32_t grid_size = - detail::max_occupancy_grid_size(block_size, - detail::retrieve); - grid_size *= 1.2; // oversubscription factor - // TODO shrink grid if n is very small + // int32_t grid_size = + // detail::max_occupancy_grid_size(block_size, + // detail::retrieve); + // grid_size *= 64; // oversubscription factor + // // TODO shrink grid if n is very small + + auto constexpr grid_stride = 1; + auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size); detail::retrieve<<>>( first, n, output_probe, output_match, counter.data(), container_ref); diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index a8fe25dee..8dad661ef 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -1137,6 +1137,29 @@ class open_addressing_ref_impl { // TODO align to 16B? __shared__ probe_type probe_buffers[num_flushing_tiles][buffer_size]; __shared__ value_type match_buffers[num_flushing_tiles][buffer_size]; + size_type num_matches = 0; + + auto flush_buffers = [&](cg::coalesced_group const& tile) { + auto const rank = tile.thread_rank(); + +#if defined(CUCO_HAS_CG_INVOKE_ONE) + auto const offset = cg::invoke_one_broadcast(tile, [&]() { + return atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + }); +#else + size_type offset; + if (rank == 0) { + offset = atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + } + offset = tile.shfl(offset, 0); +#endif + + // flush_buffers + for (size_type i = rank; i < num_matches; i += tile.size()) { + *(output_probe + offset + i) = probe_buffers[flushing_tile_id][i]; + *(output_match + offset + i) = match_buffers[flushing_tile_id][i]; + } + }; while (flushing_tile.any(idx < n)) { bool active_flag = idx < n; @@ -1152,7 +1175,6 @@ class open_addressing_ref_impl { bool empty_found = false; bool match_found = false; [[maybe_unused]] bool found_any_match = false; // only needed if `IsOuter == true` - size_type num_matches = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -1222,33 +1244,12 @@ class open_addressing_ref_impl { } } - // if the buffer has not enough empty slots for the next iteration or the tile has - // finished probing - if (finished or num_matches > (buffer_size - max_matches_per_step)) { - if (num_matches > 0) { - auto const rank = active_flushing_tile.thread_rank(); - -#if defined(CUCO_HAS_CG_INVOKE_ONE) - auto const offset = cg::invoke_one_broadcast(active_flushing_tile, [&]() { - return atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); - }); -#else - size_type offset; - if (rank == 0) { - offset = atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); - } - offset = active_flushing_tile.shfl(offset, 0); -#endif + // if the buffer has not enough empty slots for the next iteration + if (num_matches > (buffer_size - max_matches_per_step)) { + flush_buffers(active_flushing_tile); - // flush_buffers - for (size_type i = rank; i < num_matches; i += active_flushing_tile.size()) { - *(output_probe + offset + i) = probe_buffers[flushing_tile_id][i]; - *(output_match + offset + i) = match_buffers[flushing_tile_id][i]; - } - - // reset buffer counter - num_matches = 0; - } + // reset buffer counter + num_matches = 0; } // the entire flushing tile has finished its work @@ -1257,6 +1258,11 @@ class open_addressing_ref_impl { // onto the next probing window ++probing_iter; } + + // entire flusing_tile has finished; flush remaining elements + if (num_matches != 0 and active_flushing_tile.all((idx + stride) >= n)) { + flush_buffers(active_flushing_tile); + } } // onto the next key From dc2f76da1c455d51833aab647aa53a2135b5144f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Mon, 22 Jul 2024 23:03:16 +0000 Subject: [PATCH 12/13] Fix retrieve_outer --- .../open_addressing_ref_impl.cuh | 42 +++++++--- tests/static_multiset/retrieve_test.cu | 83 ++++++++++--------- 2 files changed, 74 insertions(+), 51 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 8dad661ef..2b78dc9ce 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -1228,20 +1228,38 @@ class open_addressing_ref_impl { bool const finished = active_flushing_tile.all(empty_found); if constexpr (IsOuter) { - if (finished and not found_any_match) { -#if defined(CUCO_HAS_CG_INVOKE_ONE) - cg::invoke_one(active_flushing_tile, [&]() { - probe_buffers[flushing_tile_id][num_matches] = probe; - probe_buffers[flushing_tile_id][num_matches] = this->empty_slot_sentinel(); - }); -#else - if (active_flushing_tile.thread_rank() == 0) { - probe_buffers[flushing_tile_id][num_matches] = probe; - probe_buffers[flushing_tile_id][num_matches] = this->empty_slots_sentinel(); + if (finished) { + bool const writes_sentinel = + ((probing_tile.thread_rank() == 0) and not found_any_match); + + auto const sentinel_writers = + cg::binary_partition(active_flushing_tile, writes_sentinel); + if (writes_sentinel) { + auto const rank = sentinel_writers.thread_rank(); + probe_buffers[flushing_tile_id][num_matches + rank] = probe; + match_buffers[flushing_tile_id][num_matches + rank] = this->empty_slot_sentinel(); } -#endif - num_matches++; // not really a match but a sentinel in the buffer + // add number of new matches to the buffer counter + num_matches += (writes_sentinel) + ? sentinel_writers.size() + : active_flushing_tile.size() - sentinel_writers.size(); } + // if (finished and not found_any_match) { + // #if defined(CUCO_HAS_CG_INVOKE_ONE) + // cg::invoke_one(probing_tile, [&]() { + // probe_buffers[flushing_tile_id][num_matches] = probe; + // probe_buffers[flushing_tile_id][num_matches] = + // this->empty_slot_sentinel(); + // }); + // #else + // if (probing_tile.thread_rank() == 0) { + // probe_buffers[flushing_tile_id][num_matches] = probe; + // probe_buffers[flushing_tile_id][num_matches] = + // this->empty_slot_sentinel(); + // } + // #endif + // num_matches++; // not really a match but a sentinel in the buffer + // } } // if the buffer has not enough empty slots for the next iteration diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index 117b861fe..a89a5d7d5 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -32,12 +32,13 @@ #include -static constexpr auto empty_key_sentinel = -1; +#include template void test_multiplicity(Container& container, std::size_t num_keys, std::size_t multiplicity) { - using key_type = typename Container::key_type; + using key_type = typename Container::key_type; + auto const empty_key_sentinel = container.empty_key_sentinel(); container.clear(); @@ -78,53 +79,56 @@ void test_multiplicity(Container& container, std::size_t num_keys, std::size_t m template void test_outer(Container& container, std::size_t num_keys) { - using key_type = typename Container::key_type; + using key_type = typename Container::key_type; + auto const empty_key_sentinel = container.empty_key_sentinel(); container.clear(); + thrust::device_vector insert_keys(num_keys); + thrust::sequence(insert_keys.begin(), insert_keys.end(), 0); + thrust::device_vector query_keys(num_keys * 2ull); + thrust::sequence(query_keys.begin(), query_keys.end(), 0); + thrust::device_vector probed_keys(num_keys * 2ull); thrust::device_vector matched_keys(num_keys * 2ull); - auto input_keys_begin = thrust::counting_iterator{0}; - auto const input_keys_end = input_keys_begin + num_keys; - - auto query_keys_begin = input_keys_begin; - auto const query_keys_end = query_keys_begin + num_keys * 2ull; - SECTION("Non-inserted keys should output sentinels.") { auto const [probed_end, matched_end] = container.retrieve_outer( - query_keys_begin, query_keys_end, probed_keys.begin(), matched_keys.begin()); - REQUIRE(static_cast(std::distance(probed_keys.begin(), probed_end)) == num_keys); - REQUIRE(static_cast(std::distance(matched_keys.begin(), matched_end)) == num_keys); - REQUIRE(cuco::test::all_of(matched_keys.begin(), - matched_keys.end(), - cuda::proclaim_return_type([] __device__(auto const& k) { - return static_cast( - k == static_cast(empty_key_sentinel)); - }))); + query_keys.begin(), query_keys.end(), probed_keys.begin(), matched_keys.begin()); + REQUIRE(static_cast(std::distance(probed_keys.begin(), probed_end)) == + num_keys * 2ull); + REQUIRE(static_cast(std::distance(matched_keys.begin(), matched_end)) == + num_keys * 2ull); + REQUIRE(cuco::test::all_of( + matched_keys.begin(), + matched_keys.end(), + cuda::proclaim_return_type([empty_key_sentinel] __device__(auto const& k) { + return static_cast(k == static_cast(empty_key_sentinel)); + }))); } - container.insert(input_keys_begin, input_keys_end); + container.insert(insert_keys.begin(), insert_keys.end()); SECTION("All inserted keys should be contained.") { - auto const [probed_end, matched_end] = container.retrieve( - query_keys_begin, query_keys_end, probed_keys.begin(), matched_keys.begin()); - thrust::sort(probed_keys.begin(), probed_end); - thrust::sort(matched_keys.begin(), matched_end); + auto const [probed_end, matched_end] = container.retrieve_outer( + query_keys.begin(), query_keys.end(), probed_keys.begin(), matched_keys.begin()); + thrust::sort_by_key( + probed_keys.begin(), probed_end, matched_keys.begin(), thrust::less()); + REQUIRE(cuco::test::equal( - probed_keys.begin(), probed_keys.end(), query_keys_begin, thrust::equal_to{})); + probed_keys.begin(), probed_keys.end(), query_keys.begin(), thrust::equal_to{})); REQUIRE(cuco::test::equal(matched_keys.begin(), matched_keys.begin() + num_keys, - input_keys_begin, + insert_keys.begin(), thrust::equal_to{})); - REQUIRE(cuco::test::all_of(matched_keys.begin() + num_keys, - matched_keys.end(), - cuda::proclaim_return_type([] __device__(auto const& k) { - return static_cast( - k == static_cast(empty_key_sentinel)); - }))); + REQUIRE(cuco::test::all_of( + matched_keys.begin() + num_keys, + matched_keys.end(), + cuda::proclaim_return_type([empty_key_sentinel] __device__(auto const& k) { + return static_cast(k == static_cast(empty_key_sentinel)); + }))); } } @@ -132,17 +136,18 @@ TEMPLATE_TEST_CASE_SIG( "static_multiset retrieve tests", "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), - (int32_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, cuco::test::probe_sequence::double_hashing, 2), - (int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int64_t, cuco::test::probe_sequence::double_hashing, 2), - (int32_t, cuco::test::probe_sequence::linear_probing, 1), - (int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, cuco::test::probe_sequence::linear_probing, 1), + // (int32_t, cuco::test::probe_sequence::double_hashing, 1), + // (int32_t, cuco::test::probe_sequence::double_hashing, 2), + // (int64_t, cuco::test::probe_sequence::double_hashing, 1), + // (int64_t, cuco::test::probe_sequence::double_hashing, 2), + // (int32_t, cuco::test::probe_sequence::linear_probing, 1), + // (int32_t, cuco::test::probe_sequence::linear_probing, 2), + // (int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, cuco::test::probe_sequence::linear_probing, 2)) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 0.5; + constexpr auto empty_key_sentinel = std::numeric_limits::max(); using probe = std::conditional_t>, @@ -154,5 +159,5 @@ TEMPLATE_TEST_CASE_SIG( test_multiplicity(set, num_keys, 1); // unique sequence test_multiplicity(set, num_keys, 2); // each key occurs twice test_multiplicity(set, num_keys, 11); - // test_outer(set, num_keys); // TODO still fails + test_outer(set, num_keys); } \ No newline at end of file From ae8e39664ef160b8bb8046b72883e7ee77abf37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Mon, 22 Jul 2024 23:07:04 +0000 Subject: [PATCH 13/13] Revert debugging remnants --- tests/static_multiset/retrieve_test.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index a89a5d7d5..300c8dc6c 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -136,13 +136,13 @@ TEMPLATE_TEST_CASE_SIG( "static_multiset retrieve tests", "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), - // (int32_t, cuco::test::probe_sequence::double_hashing, 1), - // (int32_t, cuco::test::probe_sequence::double_hashing, 2), - // (int64_t, cuco::test::probe_sequence::double_hashing, 1), - // (int64_t, cuco::test::probe_sequence::double_hashing, 2), - // (int32_t, cuco::test::probe_sequence::linear_probing, 1), - // (int32_t, cuco::test::probe_sequence::linear_probing, 2), - // (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, cuco::test::probe_sequence::linear_probing, 2)) { constexpr std::size_t num_keys{400};