From 3128ebcfd13a3df34ba91f8656b3dfb40ca6f347 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:32:01 -0700 Subject: [PATCH 01/10] Define cxx and cuda standards for benchmarks --- cpp/benchmarks/CMakeLists.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 99780a677..e3f235e06 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -23,6 +23,17 @@ add_library(cuspatial_benchmark_common OBJECT target_compile_features(cuspatial_benchmark_common PUBLIC cxx_std_17 cuda_std_17) +set_target_properties(cuspatial_benchmark_common + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + target_link_libraries(cuspatial_benchmark_common PUBLIC benchmark::benchmark cudf::cudftestutil @@ -43,6 +54,10 @@ function(ConfigureBench CMAKE_BENCH_NAME) set_target_properties(${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuspatial_benchmark_common) install( @@ -61,7 +76,11 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" - ) + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON +) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cuspatial_benchmark_common nvbench::main ) From 8d2cf213fafe9639af1b410e9c436bd292b63af8 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:32:28 -0700 Subject: [PATCH 02/10] add missing __host__ __device__ to satisfy clangd --- cpp/include/cuspatial/detail/range/multilinestring_range.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh index b9b53bfc0..f627d656f 100644 --- a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh +++ b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh @@ -74,6 +74,7 @@ template +CUSPATIAL_HOST_DEVICE multilinestring_range::multilinestring_range( GeometryIterator geometry_begin, GeometryIterator geometry_end, From 574ab3c6b30d8b5282f68534daed9ac463af4941 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:32:58 -0700 Subject: [PATCH 03/10] fix box docstring typo and include second template parameter in ostream operator --- cpp/include/cuspatial/geometry/box.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cuspatial/geometry/box.hpp b/cpp/include/cuspatial/geometry/box.hpp index 1041c4de2..1f3f7b955 100644 --- a/cpp/include/cuspatial/geometry/box.hpp +++ b/cpp/include/cuspatial/geometry/box.hpp @@ -40,9 +40,9 @@ class alignas(sizeof(Vertex)) box { private: /** - * @brief Output stream operator for `vec_2d` for human-readable formatting + * @brief Output stream operator for `box` for human-readable formatting */ - friend std::ostream& operator<<(std::ostream& os, cuspatial::box const& b) + friend std::ostream& operator<<(std::ostream& os, cuspatial::box const& b) { return os << "{" << b.v1 << ", " << b.v2 << "}"; } From 1e309875e902290ddddd77804f432a3a16e5996d Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:34:18 -0700 Subject: [PATCH 04/10] add to_host() to multipoint_array for debugging --- cpp/include/cuspatial_test/vector_factories.cuh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cpp/include/cuspatial_test/vector_factories.cuh b/cpp/include/cuspatial_test/vector_factories.cuh index a772cd6cf..9d3e8a31a 100644 --- a/cpp/include/cuspatial_test/vector_factories.cuh +++ b/cpp/include/cuspatial_test/vector_factories.cuh @@ -363,6 +363,17 @@ class multipoint_array { _geometry_offsets.begin(), _geometry_offsets.end(), _coordinates.begin(), _coordinates.end()}; } + /** + * @brief Copy the offset arrays to host. + */ + auto to_host() const + { + auto geometry_offsets = cuspatial::test::to_host(_geometry_offsets); + auto coordinate_offsets = cuspatial::test::to_host(_coordinates); + + return std::tuple{geometry_offsets, coordinate_offsets}; + } + /// Release ownership auto release() { return std::pair{std::move(_geometry_offsets), std::move(_coordinates)}; } From 16cb2e0e367c26b6c245586330f46c6218736bec Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:35:44 -0700 Subject: [PATCH 05/10] use the golden ratio as the seed so random points are more evenly distributed instead of distributing along a diagonal --- cpp/include/cuspatial_test/geometry_generator.cuh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/include/cuspatial_test/geometry_generator.cuh b/cpp/include/cuspatial_test/geometry_generator.cuh index a3efa5812..8c17ff291 100644 --- a/cpp/include/cuspatial_test/geometry_generator.cuh +++ b/cpp/include/cuspatial_test/geometry_generator.cuh @@ -33,6 +33,8 @@ #include +#include + namespace cuspatial { namespace test { @@ -399,8 +401,9 @@ auto generate_multipoint_array(multipoint_generator_parameter params, std::size_t{0}, params.num_points_per_multipoints); - auto engine_x = deterministic_engine(params.num_points()); - auto engine_y = deterministic_engine(2 * params.num_points()); + auto golden_ratio = (1 + std::sqrt(T{5})) / 2; + auto engine_x = deterministic_engine(golden_ratio * params.num_points()); + auto engine_y = deterministic_engine((1 / golden_ratio) * params.num_points()); auto x_dist = make_uniform_dist(params.lower_left.x, params.upper_right.x); auto y_dist = make_uniform_dist(params.lower_left.y, params.upper_right.y); From 4fa1cae5b2908eec24d097d80852e20e6f07a767 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 6 May 2024 21:40:57 -0700 Subject: [PATCH 06/10] add test to OOM quadtree_point_in_polygon on 24GB cards --- cpp/tests/CMakeLists.txt | 3 + .../quadtree_point_in_polygon_test_oom.cu | 140 ++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 cpp/tests/join/quadtree_point_in_polygon_test_oom.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7f7cb112a..c29b6da18 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -158,6 +158,9 @@ ConfigureTest(JOIN_POINT_IN_POLYGON_SMALL_TEST_EXP ConfigureTest(JOIN_POINT_IN_POLYGON_LARGE_TEST_EXP join/quadtree_point_in_polygon_test_large.cu) +ConfigureTest(JOIN_POINT_IN_POLYGON_OOM_TEST_EXP + join/quadtree_point_in_polygon_test_oom.cu) + ConfigureTest(JOIN_POINT_TO_LINESTRING_SMALL_TEST_EXP join/quadtree_point_to_nearest_linestring_test_small.cu) diff --git a/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu b/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu new file mode 100644 index 000000000..2f66918e0 --- /dev/null +++ b/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * The test uses the same quadtree structure as in pip_refine_test_small. However, the number of + * randomly generated points under all quadrants (min_size) are increased to be more than the + * number of threads per-block. + */ + +template +struct PIPRefineTestLarge : public cuspatial::test::BaseFixture {}; + +template +struct test_point_in_poly_functor { + cuspatial::multipolygon_range*> + polys; + __device__ inline T operator()(cuspatial::vec_2d point) + { + auto it = cuspatial::make_counting_transform_iterator(0, [&](auto i) { return polys[i][0]; }); + return thrust::count_if(thrust::seq, it, it + polys.num_multipolygons(), [&](auto poly) { + return cuspatial::is_point_in_polygon(point, poly); + }); + } +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_CASE(PIPRefineTestLarge, TestTypes); + +TYPED_TEST(PIPRefineTestLarge, TestOOM) +{ + using T = TypeParam; + using cuspatial::vec_2d; + using cuspatial::test::make_device_vector; + + using point_t = typename cuspatial::test::multipoint_array, + rmm::device_uvector>>; + + using polys_t = typename cuspatial::test::multipolygon_array, + rmm::device_uvector, + rmm::device_uvector, + rmm::device_uvector>>; + + vec_2d v_min{0.0, 0.0}; + vec_2d v_max{1'000.0, 1'000.0}; + T const scale{1.0}; + std::uint8_t const max_depth{15}; + std::uint32_t const min_size{10'000}; + + std::size_t const num_points{1'000'000}; + std::size_t const num_polys{24'000}; + + rmm::device_uvector> points = [&]() { + point_t points = cuspatial::test::generate_multipoint_array( + cuspatial::test::multipoint_generator_parameter{1, num_points, v_min, v_max}, + this->stream()); + return points.release().second; + }(); + + polys_t multipoly_array = cuspatial::test::generate_multipolygon_array( + cuspatial::test::multipolygon_generator_parameter{ + num_polys, + 1, + 0, + 4, + vec_2d{(v_max - v_min).x / 2, (v_max - v_min).y / 2}, + (v_max - v_min).x / 8}, + this->stream()); + + auto multipolygons = multipoly_array.range(); + + auto expected_size = thrust::count_if(rmm::exec_policy(this->stream()), + points.begin(), + points.end(), + test_point_in_poly_functor{multipolygons}); + + auto [point_indices, quadtree] = cuspatial::quadtree_on_points( + points.begin(), points.end(), v_min, v_max, scale, max_depth, min_size, this->stream()); + + auto bboxes = + rmm::device_uvector>(multipolygons.num_polygons(), this->stream()); + + cuspatial::polygon_bounding_boxes(multipolygons.part_offset_begin(), + multipolygons.part_offset_end(), + multipolygons.ring_offset_begin(), + multipolygons.ring_offset_end(), + multipolygons.point_begin(), + multipolygons.point_end(), + bboxes.begin(), + T{0}, + this->stream()); + + EXPECT_GT(bboxes.size(), 0); + + auto [poly_indices, quad_indices] = cuspatial::join_quadtree_and_bounding_boxes( + quadtree, bboxes.begin(), bboxes.end(), v_min, scale, max_depth, this->stream()); + + auto [actual_poly_indices, actual_point_indices] = + cuspatial::quadtree_point_in_polygon(poly_indices.begin(), + poly_indices.end(), + quad_indices.begin(), + quadtree, + point_indices.begin(), + point_indices.end(), + points.begin(), + multipolygons, + this->stream()); + + EXPECT_GT(actual_point_indices.size(), 0); + EXPECT_GT(actual_poly_indices.size(), 0); + + EXPECT_EQ(actual_point_indices.size(), expected_size); + EXPECT_EQ(actual_poly_indices.size(), expected_size); +} From 781fc7d7534f3cb153274302844e3289d35e1d59 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 7 May 2024 04:50:03 +0000 Subject: [PATCH 07/10] fix lint --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/include/cuspatial/detail/range/multilinestring_range.cuh | 2 +- cpp/include/cuspatial/geometry/box.hpp | 2 +- cpp/tests/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e3f235e06..10e626f3f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh index f627d656f..03ad0fe27 100644 --- a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh +++ b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cuspatial/geometry/box.hpp b/cpp/include/cuspatial/geometry/box.hpp index 1f3f7b955..4a9f97639 100644 --- a/cpp/include/cuspatial/geometry/box.hpp +++ b/cpp/include/cuspatial/geometry/box.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c29b6da18..f6752c860 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 57354cbd6f703a9ff1fcdb0e954c51b90fb96179 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 7 May 2024 04:50:34 +0000 Subject: [PATCH 08/10] fix new quadtree spatial join OOM test --- .../detail/join/quadtree_point_in_polygon.cuh | 105 +++++++++++------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh index 0b8f23e19..10b96fe85 100644 --- a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh +++ b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh @@ -24,15 +24,15 @@ #include #include +#include #include #include #include #include +#include #include -#include - namespace cuspatial { namespace detail { @@ -57,7 +57,7 @@ struct compute_poly_and_point_indices { using IndexType = iterator_value_type; inline thrust::tuple __device__ - operator()(IndexType const global_index) const + operator()(std::uint64_t const global_index) const { auto const [quad_poly_index, local_point_index] = get_quad_and_local_point_indices(global_index, point_offsets_begin, point_offsets_end); @@ -118,16 +118,22 @@ std::pair, rmm::device_uvector> quadtr auto num_poly_quad_pairs = std::distance(poly_indices_first, poly_indices_last); - auto quad_lengths_iter = - thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first); + auto quad_lengths_iter = thrust::make_transform_iterator( + thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first), + cuda::proclaim_return_type([] __device__(IndexType const& i) -> std::uint64_t { + return static_cast(i); + })); auto quad_offsets_iter = thrust::make_permutation_iterator(quadtree.offset_begin(), quad_indices_first); + // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) + auto point_xys_iter = thrust::make_permutation_iterator(points_first, point_indices_first); + // Compute a "local" set of zero-based point offsets from number of points in each quadrant // Use `num_poly_quad_pairs + 1` as the length so that the last element produced by // `inclusive_scan` is the total number of points to be tested against any polygon. - rmm::device_uvector local_point_offsets(num_poly_quad_pairs + 1, stream); + rmm::device_uvector local_point_offsets(num_poly_quad_pairs + 1, stream); // inclusive scan of quad_lengths_iter thrust::inclusive_scan(rmm::exec_policy(stream), @@ -136,52 +142,67 @@ std::pair, rmm::device_uvector> quadtr local_point_offsets.begin() + 1); // Ensure local point offsets starts at 0 - IndexType init{0}; + std::uint64_t init{0}; local_point_offsets.set_element_async(0, init, stream); // The last element is the total number of points to test against any polygon. auto num_total_points = local_point_offsets.back_element(stream); - // Allocate the output polygon and point index pair vectors - rmm::device_uvector poly_indices(num_total_points, stream); - rmm::device_uvector point_indices(num_total_points, stream); - - auto poly_and_point_indices = - thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()); + // Process in chunks of *at most* 25% of the free available device memory to avoid OOM'ing + // when allocating additional output space. + auto max_chunk_size = + static_cast(rmm::percent_of_free_device_memory(25) / (sizeof(IndexType) * 2)); - // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) - auto point_xys_iter = thrust::make_permutation_iterator(points_first, point_indices_first); + auto num_chunks = std::uint64_t{num_total_points / max_chunk_size + 1}; - // Compute the combination of polygon and point index pairs. For each polygon/quadrant pair, - // enumerate pairs of (poly_index, point_index) for each point in each quadrant. - // - // In Python pseudocode: - // ``` - // pp_pairs = [] - // for polygon, quadrant in pq_pairs: - // for point in quadrant: - // pp_pairs.append((polygon, point)) - // ``` - // - auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator( - 0, - detail::compute_poly_and_point_indices{quad_offsets_iter, - local_point_offsets.begin(), - local_point_offsets.end(), - poly_indices_first}); - - // Compute the number of intersections by removing (poly, point) pairs that don't intersect - auto num_intersections = thrust::distance( - poly_and_point_indices, - thrust::copy_if(rmm::exec_policy(stream), - global_to_poly_and_point_indices, - global_to_poly_and_point_indices + num_total_points, - poly_and_point_indices, - detail::test_poly_point_intersection{point_xys_iter, polygons})); + // Allocate the output polygon and point index pair vectors + rmm::device_uvector poly_indices(0, stream); + rmm::device_uvector point_indices(0, stream); + + std::uint64_t num_intersections = 0; + + for (std::uint64_t chunk_index = 0, chunk_offset = 0; chunk_index < num_chunks; ++chunk_index) { + auto chunk_size = std::min(max_chunk_size, num_total_points - (chunk_index * max_chunk_size)); + + poly_indices.resize(num_intersections + chunk_size, stream); + point_indices.resize(num_intersections + chunk_size, stream); + + // Compute the combination of polygon and point index pairs. For each polygon/quadrant pair, + // enumerate pairs of (poly_index, point_index) for each point in each quadrant. + // + // In Python pseudocode: + // ``` + // pp_pairs = [] + // for polygon, quadrant in pq_pairs: + // for point in quadrant: + // pp_pairs.append((polygon, point)) + // ``` + // + auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator( + chunk_offset, + detail::compute_poly_and_point_indices{quad_offsets_iter, + local_point_offsets.begin(), + local_point_offsets.end(), + poly_indices_first}); + + auto poly_and_point_indices = + thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()) + num_intersections; + + // Compute the number of intersections by removing (poly, point) pairs that don't intersect + num_intersections += thrust::distance( + poly_and_point_indices, + thrust::copy_if(rmm::exec_policy(stream), + global_to_poly_and_point_indices, + global_to_poly_and_point_indices + chunk_size, + poly_and_point_indices, + detail::test_poly_point_intersection{point_xys_iter, polygons})); + + chunk_offset += chunk_size; + } poly_indices.resize(num_intersections, stream); - poly_indices.shrink_to_fit(stream); point_indices.resize(num_intersections, stream); + poly_indices.shrink_to_fit(stream); point_indices.shrink_to_fit(stream); return std::pair{std::move(poly_indices), std::move(point_indices)}; From 4f77148bfb668b76cd9791dfaf84a64b6d81a6bf Mon Sep 17 00:00:00 2001 From: ptaylor Date: Fri, 10 May 2024 23:32:02 +0000 Subject: [PATCH 09/10] try to process all points first, fallback to counting the number of hits and allocating the exact amount of memory for the results --- .../detail/join/quadtree_point_in_polygon.cuh | 140 +++++++++++------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh index 10b96fe85..2922268b3 100644 --- a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh +++ b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh @@ -33,6 +33,8 @@ #include #include +#include + namespace cuspatial { namespace detail { @@ -118,6 +120,9 @@ std::pair, rmm::device_uvector> quadtr auto num_poly_quad_pairs = std::distance(poly_indices_first, poly_indices_last); + // The quadtree length is an iterator of uint32_t, but we have to transform into uint64_t values + // so the thrust::inclusive_scan accumulates into uint64_t outputs. Changing the output iterator + // to uint64_t isn't sufficient to achieve this behavior. auto quad_lengths_iter = thrust::make_transform_iterator( thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first), cuda::proclaim_return_type([] __device__(IndexType const& i) -> std::uint64_t { @@ -127,12 +132,13 @@ std::pair, rmm::device_uvector> quadtr auto quad_offsets_iter = thrust::make_permutation_iterator(quadtree.offset_begin(), quad_indices_first); - // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) - auto point_xys_iter = thrust::make_permutation_iterator(points_first, point_indices_first); - - // Compute a "local" set of zero-based point offsets from number of points in each quadrant + // Compute a "local" set of zero-based point offsets from the number of points in each quadrant. + // // Use `num_poly_quad_pairs + 1` as the length so that the last element produced by // `inclusive_scan` is the total number of points to be tested against any polygon. + // + // Accumulate into uint64_t, because the prefix sums can overflow the size of uint32_t + // when testing a large number of polygons against a large quadtree. rmm::device_uvector local_point_offsets(num_poly_quad_pairs + 1, stream); // inclusive scan of quad_lengths_iter @@ -148,64 +154,84 @@ std::pair, rmm::device_uvector> quadtr // The last element is the total number of points to test against any polygon. auto num_total_points = local_point_offsets.back_element(stream); - // Process in chunks of *at most* 25% of the free available device memory to avoid OOM'ing - // when allocating additional output space. - auto max_chunk_size = - static_cast(rmm::percent_of_free_device_memory(25) / (sizeof(IndexType) * 2)); - - auto num_chunks = std::uint64_t{num_total_points / max_chunk_size + 1}; - - // Allocate the output polygon and point index pair vectors - rmm::device_uvector poly_indices(0, stream); - rmm::device_uvector point_indices(0, stream); - - std::uint64_t num_intersections = 0; - - for (std::uint64_t chunk_index = 0, chunk_offset = 0; chunk_index < num_chunks; ++chunk_index) { - auto chunk_size = std::min(max_chunk_size, num_total_points - (chunk_index * max_chunk_size)); - - poly_indices.resize(num_intersections + chunk_size, stream); - point_indices.resize(num_intersections + chunk_size, stream); - - // Compute the combination of polygon and point index pairs. For each polygon/quadrant pair, - // enumerate pairs of (poly_index, point_index) for each point in each quadrant. - // - // In Python pseudocode: - // ``` - // pp_pairs = [] - // for polygon, quadrant in pq_pairs: - // for point in quadrant: - // pp_pairs.append((polygon, point)) - // ``` - // - auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator( - chunk_offset, + // The largest supported input size for thrust::count_if/copy_if is INT32_MAX. + // This functor iterates over the input space and processes up to INT32_MAX elements at a time. + std::uint64_t max_points_to_test = std::numeric_limits::max(); + auto count_in_chunks = [&](auto const& func) { + std::uint64_t memo{}; + for (std::uint64_t offset{0}; offset < num_total_points; offset += max_points_to_test) { + memo += func(memo, offset, std::min(max_points_to_test, num_total_points - offset)); + } + return memo; + }; + + detail::test_poly_point_intersection test_poly_point_pair{ + // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) + thrust::make_permutation_iterator(points_first, point_indices_first), + polygons}; + + // Compute the combination of polygon and point index pairs. For each polygon/quadrant pair, + // enumerate pairs of (poly_index, point_index) for each point in each quadrant. + // + // In Python pseudocode: + // ``` + // pp_pairs = [] + // for polygon, quadrant in pq_pairs: + // for point in quadrant: + // pp_pairs.append((polygon, point)) + // ``` + // + auto global_to_poly_and_point_indices = [&](auto offset = 0) { + return detail::make_counting_transform_iterator( + offset, detail::compute_poly_and_point_indices{quad_offsets_iter, local_point_offsets.begin(), local_point_offsets.end(), poly_indices_first}); - - auto poly_and_point_indices = - thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()) + num_intersections; - - // Compute the number of intersections by removing (poly, point) pairs that don't intersect - num_intersections += thrust::distance( - poly_and_point_indices, - thrust::copy_if(rmm::exec_policy(stream), - global_to_poly_and_point_indices, - global_to_poly_and_point_indices + chunk_size, - poly_and_point_indices, - detail::test_poly_point_intersection{point_xys_iter, polygons})); - - chunk_offset += chunk_size; + }; + + auto run_quadtree_point_in_polygon = [&](auto output_size) { + // Allocate the output polygon and point index pair vectors + rmm::device_uvector poly_indices(output_size, stream); + rmm::device_uvector point_indices(output_size, stream); + + auto num_intersections = count_in_chunks([&](auto memo, auto offset, auto size) { + auto poly_and_point_indices = + thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()) + memo; + // Remove (poly, point) pairs that don't intersect + return thrust::distance(poly_and_point_indices, + thrust::copy_if(rmm::exec_policy(stream), + global_to_poly_and_point_indices(offset), + global_to_poly_and_point_indices(offset) + size, + poly_and_point_indices, + test_poly_point_pair)); + }); + + if (num_intersections < output_size) { + poly_indices.resize(num_intersections, stream); + point_indices.resize(num_intersections, stream); + poly_indices.shrink_to_fit(stream); + point_indices.shrink_to_fit(stream); + } + + return std::pair{std::move(poly_indices), std::move(point_indices)}; + }; + + try { + // First attempt to run the hit test assuming allocating space for all possible intersections + // fits into the available memory. + return run_quadtree_point_in_polygon(num_total_points); + } catch (std::exception const&) { + // If we OOM the first time, pre-compute the number of hits and allocate only that amount of + // space for the output buffers. This halves performance, but it should at least return valid + // results. + return run_quadtree_point_in_polygon(count_in_chunks([&](auto memo, auto offset, auto size) { + return thrust::count_if(rmm::exec_policy(stream), + global_to_poly_and_point_indices(offset), + global_to_poly_and_point_indices(offset) + size, + test_poly_point_pair); + })); } - - poly_indices.resize(num_intersections, stream); - point_indices.resize(num_intersections, stream); - poly_indices.shrink_to_fit(stream); - point_indices.shrink_to_fit(stream); - - return std::pair{std::move(poly_indices), std::move(point_indices)}; } } // namespace cuspatial From 2d8ecf04012a4650ebb473410b773bf3d30d9f9a Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Mon, 13 May 2024 14:37:48 -0700 Subject: [PATCH 10/10] Update cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh --- cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh index 2922268b3..68a2e9596 100644 --- a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh +++ b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh @@ -221,7 +221,7 @@ std::pair, rmm::device_uvector> quadtr // First attempt to run the hit test assuming allocating space for all possible intersections // fits into the available memory. return run_quadtree_point_in_polygon(num_total_points); - } catch (std::exception const&) { + } catch (rmm::out_of_memory const&) { // If we OOM the first time, pre-compute the number of hits and allocate only that amount of // space for the output buffers. This halves performance, but it should at least return valid // results.