From 8840189c2b747593100049ee81dd83174fa9eaf1 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Fri, 24 May 2024 06:36:48 -0700 Subject: [PATCH] Fix quadtree spatial join OOMs on large numbers of input polygons (#1381) Followup to https://github.com/rapidsai/cuspatial/pull/1346. * Fixes some typos/omissions in types and CMake. * Adds a new test that OOMs when quadtree_point_in_polygon is passed too many input polygons. * Fixes quadtree spatial join to handle overflow while counting and more conservatively allocate output buffers. Fixes #890. * [Failing test run](https://github.com/rapidsai/cuspatial/actions/runs/8979838628/job/24662981350#step:7:840) * [Passing test run](https://github.com/rapidsai/cuspatial/actions/runs/8981106226/job/24666403165#step:7:840) Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Mark Harris (https://github.com/harrism) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cuspatial/pull/1381 --- cpp/benchmarks/CMakeLists.txt | 23 ++- .../detail/join/quadtree_point_in_polygon.cuh | 123 ++++++++++----- .../detail/range/multilinestring_range.cuh | 3 +- cpp/include/cuspatial/geometry/box.hpp | 6 +- .../cuspatial_test/geometry_generator.cuh | 7 +- .../cuspatial_test/vector_factories.cuh | 11 ++ cpp/tests/CMakeLists.txt | 5 +- .../quadtree_point_in_polygon_test_oom.cu | 140 ++++++++++++++++++ 8 files changed, 271 insertions(+), 47 deletions(-) create mode 100644 cpp/tests/join/quadtree_point_in_polygon_test_oom.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 99780a677..10e626f3f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,6 +23,17 @@ add_library(cuspatial_benchmark_common OBJECT target_compile_features(cuspatial_benchmark_common PUBLIC cxx_std_17 cuda_std_17) +set_target_properties(cuspatial_benchmark_common + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + target_link_libraries(cuspatial_benchmark_common PUBLIC benchmark::benchmark cudf::cudftestutil @@ -43,6 +54,10 @@ function(ConfigureBench CMAKE_BENCH_NAME) set_target_properties(${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuspatial_benchmark_common) install( @@ -61,7 +76,11 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" - ) + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON +) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cuspatial_benchmark_common nvbench::main ) diff --git a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh index 0b8f23e19..68a2e9596 100644 --- a/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh +++ b/cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh @@ -24,14 +24,16 @@ #include #include +#include #include #include #include #include +#include #include -#include +#include namespace cuspatial { namespace detail { @@ -57,7 +59,7 @@ struct compute_poly_and_point_indices { using IndexType = iterator_value_type; inline thrust::tuple __device__ - operator()(IndexType const global_index) const + operator()(std::uint64_t const global_index) const { auto const [quad_poly_index, local_point_index] = get_quad_and_local_point_indices(global_index, point_offsets_begin, point_offsets_end); @@ -118,16 +120,26 @@ std::pair, rmm::device_uvector> quadtr auto num_poly_quad_pairs = std::distance(poly_indices_first, poly_indices_last); - auto quad_lengths_iter = - thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first); + // The quadtree length is an iterator of uint32_t, but we have to transform into uint64_t values + // so the thrust::inclusive_scan accumulates into uint64_t outputs. Changing the output iterator + // to uint64_t isn't sufficient to achieve this behavior. + auto quad_lengths_iter = thrust::make_transform_iterator( + thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first), + cuda::proclaim_return_type([] __device__(IndexType const& i) -> std::uint64_t { + return static_cast(i); + })); auto quad_offsets_iter = thrust::make_permutation_iterator(quadtree.offset_begin(), quad_indices_first); - // Compute a "local" set of zero-based point offsets from number of points in each quadrant + // Compute a "local" set of zero-based point offsets from the number of points in each quadrant. + // // Use `num_poly_quad_pairs + 1` as the length so that the last element produced by // `inclusive_scan` is the total number of points to be tested against any polygon. - rmm::device_uvector local_point_offsets(num_poly_quad_pairs + 1, stream); + // + // Accumulate into uint64_t, because the prefix sums can overflow the size of uint32_t + // when testing a large number of polygons against a large quadtree. + rmm::device_uvector local_point_offsets(num_poly_quad_pairs + 1, stream); // inclusive scan of quad_lengths_iter thrust::inclusive_scan(rmm::exec_policy(stream), @@ -136,21 +148,27 @@ std::pair, rmm::device_uvector> quadtr local_point_offsets.begin() + 1); // Ensure local point offsets starts at 0 - IndexType init{0}; + std::uint64_t init{0}; local_point_offsets.set_element_async(0, init, stream); // The last element is the total number of points to test against any polygon. auto num_total_points = local_point_offsets.back_element(stream); - // Allocate the output polygon and point index pair vectors - rmm::device_uvector poly_indices(num_total_points, stream); - rmm::device_uvector point_indices(num_total_points, stream); - - auto poly_and_point_indices = - thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()); - - // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) - auto point_xys_iter = thrust::make_permutation_iterator(points_first, point_indices_first); + // The largest supported input size for thrust::count_if/copy_if is INT32_MAX. + // This functor iterates over the input space and processes up to INT32_MAX elements at a time. + std::uint64_t max_points_to_test = std::numeric_limits::max(); + auto count_in_chunks = [&](auto const& func) { + std::uint64_t memo{}; + for (std::uint64_t offset{0}; offset < num_total_points; offset += max_points_to_test) { + memo += func(memo, offset, std::min(max_points_to_test, num_total_points - offset)); + } + return memo; + }; + + detail::test_poly_point_intersection test_poly_point_pair{ + // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction) + thrust::make_permutation_iterator(points_first, point_indices_first), + polygons}; // Compute the combination of polygon and point index pairs. For each polygon/quadrant pair, // enumerate pairs of (poly_index, point_index) for each point in each quadrant. @@ -163,28 +181,57 @@ std::pair, rmm::device_uvector> quadtr // pp_pairs.append((polygon, point)) // ``` // - auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator( - 0, - detail::compute_poly_and_point_indices{quad_offsets_iter, - local_point_offsets.begin(), - local_point_offsets.end(), - poly_indices_first}); - - // Compute the number of intersections by removing (poly, point) pairs that don't intersect - auto num_intersections = thrust::distance( - poly_and_point_indices, - thrust::copy_if(rmm::exec_policy(stream), - global_to_poly_and_point_indices, - global_to_poly_and_point_indices + num_total_points, - poly_and_point_indices, - detail::test_poly_point_intersection{point_xys_iter, polygons})); - - poly_indices.resize(num_intersections, stream); - poly_indices.shrink_to_fit(stream); - point_indices.resize(num_intersections, stream); - point_indices.shrink_to_fit(stream); - - return std::pair{std::move(poly_indices), std::move(point_indices)}; + auto global_to_poly_and_point_indices = [&](auto offset = 0) { + return detail::make_counting_transform_iterator( + offset, + detail::compute_poly_and_point_indices{quad_offsets_iter, + local_point_offsets.begin(), + local_point_offsets.end(), + poly_indices_first}); + }; + + auto run_quadtree_point_in_polygon = [&](auto output_size) { + // Allocate the output polygon and point index pair vectors + rmm::device_uvector poly_indices(output_size, stream); + rmm::device_uvector point_indices(output_size, stream); + + auto num_intersections = count_in_chunks([&](auto memo, auto offset, auto size) { + auto poly_and_point_indices = + thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()) + memo; + // Remove (poly, point) pairs that don't intersect + return thrust::distance(poly_and_point_indices, + thrust::copy_if(rmm::exec_policy(stream), + global_to_poly_and_point_indices(offset), + global_to_poly_and_point_indices(offset) + size, + poly_and_point_indices, + test_poly_point_pair)); + }); + + if (num_intersections < output_size) { + poly_indices.resize(num_intersections, stream); + point_indices.resize(num_intersections, stream); + poly_indices.shrink_to_fit(stream); + point_indices.shrink_to_fit(stream); + } + + return std::pair{std::move(poly_indices), std::move(point_indices)}; + }; + + try { + // First attempt to run the hit test assuming allocating space for all possible intersections + // fits into the available memory. + return run_quadtree_point_in_polygon(num_total_points); + } catch (rmm::out_of_memory const&) { + // If we OOM the first time, pre-compute the number of hits and allocate only that amount of + // space for the output buffers. This halves performance, but it should at least return valid + // results. + return run_quadtree_point_in_polygon(count_in_chunks([&](auto memo, auto offset, auto size) { + return thrust::count_if(rmm::exec_policy(stream), + global_to_poly_and_point_indices(offset), + global_to_poly_and_point_indices(offset) + size, + test_poly_point_pair); + })); + } } } // namespace cuspatial diff --git a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh index b9b53bfc0..03ad0fe27 100644 --- a/cpp/include/cuspatial/detail/range/multilinestring_range.cuh +++ b/cpp/include/cuspatial/detail/range/multilinestring_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,6 +74,7 @@ template +CUSPATIAL_HOST_DEVICE multilinestring_range::multilinestring_range( GeometryIterator geometry_begin, GeometryIterator geometry_end, diff --git a/cpp/include/cuspatial/geometry/box.hpp b/cpp/include/cuspatial/geometry/box.hpp index 1041c4de2..4a9f97639 100644 --- a/cpp/include/cuspatial/geometry/box.hpp +++ b/cpp/include/cuspatial/geometry/box.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,9 @@ class alignas(sizeof(Vertex)) box { private: /** - * @brief Output stream operator for `vec_2d` for human-readable formatting + * @brief Output stream operator for `box` for human-readable formatting */ - friend std::ostream& operator<<(std::ostream& os, cuspatial::box const& b) + friend std::ostream& operator<<(std::ostream& os, cuspatial::box const& b) { return os << "{" << b.v1 << ", " << b.v2 << "}"; } diff --git a/cpp/include/cuspatial_test/geometry_generator.cuh b/cpp/include/cuspatial_test/geometry_generator.cuh index a3efa5812..8c17ff291 100644 --- a/cpp/include/cuspatial_test/geometry_generator.cuh +++ b/cpp/include/cuspatial_test/geometry_generator.cuh @@ -33,6 +33,8 @@ #include +#include + namespace cuspatial { namespace test { @@ -399,8 +401,9 @@ auto generate_multipoint_array(multipoint_generator_parameter params, std::size_t{0}, params.num_points_per_multipoints); - auto engine_x = deterministic_engine(params.num_points()); - auto engine_y = deterministic_engine(2 * params.num_points()); + auto golden_ratio = (1 + std::sqrt(T{5})) / 2; + auto engine_x = deterministic_engine(golden_ratio * params.num_points()); + auto engine_y = deterministic_engine((1 / golden_ratio) * params.num_points()); auto x_dist = make_uniform_dist(params.lower_left.x, params.upper_right.x); auto y_dist = make_uniform_dist(params.lower_left.y, params.upper_right.y); diff --git a/cpp/include/cuspatial_test/vector_factories.cuh b/cpp/include/cuspatial_test/vector_factories.cuh index a772cd6cf..9d3e8a31a 100644 --- a/cpp/include/cuspatial_test/vector_factories.cuh +++ b/cpp/include/cuspatial_test/vector_factories.cuh @@ -363,6 +363,17 @@ class multipoint_array { _geometry_offsets.begin(), _geometry_offsets.end(), _coordinates.begin(), _coordinates.end()}; } + /** + * @brief Copy the offset arrays to host. + */ + auto to_host() const + { + auto geometry_offsets = cuspatial::test::to_host(_geometry_offsets); + auto coordinate_offsets = cuspatial::test::to_host(_coordinates); + + return std::tuple{geometry_offsets, coordinate_offsets}; + } + /// Release ownership auto release() { return std::pair{std::move(_geometry_offsets), std::move(_coordinates)}; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7f7cb112a..f6752c860 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -158,6 +158,9 @@ ConfigureTest(JOIN_POINT_IN_POLYGON_SMALL_TEST_EXP ConfigureTest(JOIN_POINT_IN_POLYGON_LARGE_TEST_EXP join/quadtree_point_in_polygon_test_large.cu) +ConfigureTest(JOIN_POINT_IN_POLYGON_OOM_TEST_EXP + join/quadtree_point_in_polygon_test_oom.cu) + ConfigureTest(JOIN_POINT_TO_LINESTRING_SMALL_TEST_EXP join/quadtree_point_to_nearest_linestring_test_small.cu) diff --git a/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu b/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu new file mode 100644 index 000000000..2f66918e0 --- /dev/null +++ b/cpp/tests/join/quadtree_point_in_polygon_test_oom.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * The test uses the same quadtree structure as in pip_refine_test_small. However, the number of + * randomly generated points under all quadrants (min_size) are increased to be more than the + * number of threads per-block. + */ + +template +struct PIPRefineTestLarge : public cuspatial::test::BaseFixture {}; + +template +struct test_point_in_poly_functor { + cuspatial::multipolygon_range*> + polys; + __device__ inline T operator()(cuspatial::vec_2d point) + { + auto it = cuspatial::make_counting_transform_iterator(0, [&](auto i) { return polys[i][0]; }); + return thrust::count_if(thrust::seq, it, it + polys.num_multipolygons(), [&](auto poly) { + return cuspatial::is_point_in_polygon(point, poly); + }); + } +}; + +using TestTypes = ::testing::Types; + +TYPED_TEST_CASE(PIPRefineTestLarge, TestTypes); + +TYPED_TEST(PIPRefineTestLarge, TestOOM) +{ + using T = TypeParam; + using cuspatial::vec_2d; + using cuspatial::test::make_device_vector; + + using point_t = typename cuspatial::test::multipoint_array, + rmm::device_uvector>>; + + using polys_t = typename cuspatial::test::multipolygon_array, + rmm::device_uvector, + rmm::device_uvector, + rmm::device_uvector>>; + + vec_2d v_min{0.0, 0.0}; + vec_2d v_max{1'000.0, 1'000.0}; + T const scale{1.0}; + std::uint8_t const max_depth{15}; + std::uint32_t const min_size{10'000}; + + std::size_t const num_points{1'000'000}; + std::size_t const num_polys{24'000}; + + rmm::device_uvector> points = [&]() { + point_t points = cuspatial::test::generate_multipoint_array( + cuspatial::test::multipoint_generator_parameter{1, num_points, v_min, v_max}, + this->stream()); + return points.release().second; + }(); + + polys_t multipoly_array = cuspatial::test::generate_multipolygon_array( + cuspatial::test::multipolygon_generator_parameter{ + num_polys, + 1, + 0, + 4, + vec_2d{(v_max - v_min).x / 2, (v_max - v_min).y / 2}, + (v_max - v_min).x / 8}, + this->stream()); + + auto multipolygons = multipoly_array.range(); + + auto expected_size = thrust::count_if(rmm::exec_policy(this->stream()), + points.begin(), + points.end(), + test_point_in_poly_functor{multipolygons}); + + auto [point_indices, quadtree] = cuspatial::quadtree_on_points( + points.begin(), points.end(), v_min, v_max, scale, max_depth, min_size, this->stream()); + + auto bboxes = + rmm::device_uvector>(multipolygons.num_polygons(), this->stream()); + + cuspatial::polygon_bounding_boxes(multipolygons.part_offset_begin(), + multipolygons.part_offset_end(), + multipolygons.ring_offset_begin(), + multipolygons.ring_offset_end(), + multipolygons.point_begin(), + multipolygons.point_end(), + bboxes.begin(), + T{0}, + this->stream()); + + EXPECT_GT(bboxes.size(), 0); + + auto [poly_indices, quad_indices] = cuspatial::join_quadtree_and_bounding_boxes( + quadtree, bboxes.begin(), bboxes.end(), v_min, scale, max_depth, this->stream()); + + auto [actual_poly_indices, actual_point_indices] = + cuspatial::quadtree_point_in_polygon(poly_indices.begin(), + poly_indices.end(), + quad_indices.begin(), + quadtree, + point_indices.begin(), + point_indices.end(), + points.begin(), + multipolygons, + this->stream()); + + EXPECT_GT(actual_point_indices.size(), 0); + EXPECT_GT(actual_poly_indices.size(), 0); + + EXPECT_EQ(actual_point_indices.size(), expected_size); + EXPECT_EQ(actual_poly_indices.size(), expected_size); +}