From b1e1c9c060cc6b4b35b8590209177584336444bc Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 26 Sep 2024 11:00:00 -0700 Subject: [PATCH 01/14] Reapply `mixed_semi_join` refactoring and bug fixes (#16859) This PR reapplies changes from #16230 and adds bug fixes and performance improvements for mixed_semi_join. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16859 --- cpp/src/join/join_common_utils.hpp | 6 - cpp/src/join/mixed_join_common_utils.cuh | 34 ++++++ cpp/src/join/mixed_join_kernels_semi.cu | 51 ++++---- cpp/src/join/mixed_join_kernels_semi.cuh | 6 +- cpp/src/join/mixed_join_semi.cu | 92 +++++--------- cpp/tests/join/mixed_join_tests.cu | 147 +++++++++++++++++++++++ 6 files changed, 239 insertions(+), 97 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 86402a0e7de..573101cefd9 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -51,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using semi_map_type = cuco::legacy::static_map>; - using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 19701816867..4a52cfe098a 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,6 +25,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality { } }; +/** + * @brief Equality comparator that composes two row_equality comparators. 
+ */ +struct double_row_equality_comparator { + row_equality const equality_comparator; + row_equality const conditional_comparator; + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } +}; + +// A CUDA Cooperative Group of 1 thread for the hash set for mixed semi. +auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1; + +// The hash set type used by mixed_semi_join with the build_table. +using hash_set_type = + cuco::static_set, + cuda::thread_scope_device, + double_row_equality_comparator, + cuco::linear_probing, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + +// The hash_set_ref_type used by mixed_semi_join kernels for probing. +using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..bd8c80652a0 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent 
conflicts between arrays of different types between // different template instantiations due to the extern specifier. extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = + auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates); - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + // Equality evaluator to use + auto const evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + // Make sure to swap_tables here as hash_set will use probe table as the left one + auto constexpr swap_tables = true; + auto const equality = single_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); + // Create set ref with the new equality comparator + auto const set_ref_equality = set_ref.with_key_eq(equality); - if (outer_row_index < outer_num_rows) { - // Figure out the number of elements for this key. 
- auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + // Total number of rows to query the set + auto const outer_num_rows = left_table.num_rows(); + // Grid stride for the tile + auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride() / cg_size; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + // Find all the rows in the left table that are in the hash table + for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + outer_row_index < outer_num_rows; + outer_row_index += cg_grid_stride) { + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; } } } @@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to 
probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index aa4fa281159..83a55eca50f 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,45 +45,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. 
- */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -95,7 +56,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -136,7 +97,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. 
auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -155,27 +116,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. An alternative solution @@ -190,20 +144,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. 
if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert_async(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -211,18 +173,19 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -231,9 +194,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..9041969bec7 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,138 @@ 
TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge) +{ + using T1 = double; + + // Number of rows in each column + auto constexpr N = 10000; + + // Generate column data for left and right tables + auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns(N, 200); + auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns(N, 100); + + // Setup data and nulls for the left table + std::vector, std::vector>> lefts = { + {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}}; + std::vector> left_wrappers; + std::vector left_columns; + for (auto [data, valids] : lefts) { + left_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + left_columns.emplace_back(left_wrappers.back()); + }; + + // Setup data and nulls for the right table + std::vector, std::vector>> rights = { + {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}}; + std::vector> right_wrappers; + std::vector right_columns; + for (auto [data, valids] : rights) { + right_wrappers.emplace_back( + cudf::test::fixed_width_column_wrapper(data.begin(), data.end(), valids.begin())); + right_columns.emplace_back(right_wrappers.back()); + }; + + // Left and right table views. 
+ auto const left_table = cudf::table_view{left_columns}; + auto const right_table = cudf::table_view{right_columns}; + + // Using the zeroth column for equality. + auto const left_equality = left_table.select({0}); + auto const right_equality = right_table.select({0}); + + // Column references for equality column. + auto const col_ref_left_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + // Mixed semi join with zeroth column equality + { + // Expected left_semi_join result + auto const expected_mixed_semi_join = + cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero); + + // Actual mixed_left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_zero_eq_right_zero, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. + EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } + + // Mixed semi join with zeroth column equality and first column GREATER conditional + { + // Column references for conditional column. 
+ auto const col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); + auto left_one_gt_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + // Expected left_semi_join result + auto const expected_mixed_semi_join = cudf::conditional_left_semi_join( + left_table, + right_table, + cudf::ast::operation( + cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one)); + + // Actual left_semi_join result + auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality, + right_equality, + left_table, + right_table, + left_one_gt_right_one, + cudf::null_equality::UNEQUAL); + + // Copy data back to host for comparisons + auto expected_indices = cudf::detail::make_std_vector_async( + cudf::device_span(*expected_mixed_semi_join), cudf::get_default_stream()); + auto result_indices = cudf::detail::make_std_vector_sync( + cudf::device_span(*mixed_semi_join), cudf::get_default_stream()); + + // Sort the indices for 1-1 comparison + std::sort(expected_indices.begin(), expected_indices.end()); + std::sort(result_indices.begin(), result_indices.end()); + + // Expected and actual vectors must match. 
+ EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size()); + EXPECT_TRUE( + std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin())); + } +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} From d69e4b6fbdff9ad402a37de7940d64ed16b7d329 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 26 Sep 2024 08:07:48 -1000 Subject: [PATCH 02/14] Respect groupby.nunique(dropna=False) (#16921) closes https://github.com/rapidsai/cudf/issues/16861 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16921 --- python/cudf/cudf/_lib/aggregation.pyx | 7 +++++-- python/cudf/cudf/core/groupby/groupby.py | 16 ++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 17 +++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7c91533cf93..3c96b90f0a1 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -78,8 +78,11 @@ class Aggregation: ) 
@classmethod - def nunique(cls): - return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE)) + def nunique(cls, dropna=True): + return cls(pylibcudf.aggregation.nunique( + pylibcudf.types.NullPolicy.EXCLUDE + if dropna else pylibcudf.types.NullPolicy.INCLUDE + )) @classmethod def nth(cls, size): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cb8cd0cd28b..be05075a2cd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2232,6 +2232,22 @@ def func(x): return self.agg(func) + @_performance_tracking + def nunique(self, dropna: bool = True): + """ + Return number of unique elements in the group. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + """ + + def func(x): + return getattr(x, "nunique")(dropna=dropna) + + return self.agg(func) + @_performance_tracking def std( self, diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 848bc259e7b..14ba9894fd3 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1940,6 +1940,23 @@ def test_groupby_nunique(agg, by): assert_groupby_results_equal(expect, got, check_dtype=False) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique_dropna(dropna): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2], + "b": [4, None, 5], + "c": [None, None, 7], + "d": [1, 1, 3], + } + ) + pdf = gdf.to_pandas() + + result = gdf.groupby("a")["b"].nunique(dropna=dropna) + expected = pdf.groupby("a")["b"].nunique(dropna=dropna) + assert_groupby_results_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize( "n", [0, 1, 2, 10], From 742eaadb92b0c5159d92be49e647a17e8c1d0b9b Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 26 Sep 2024 14:27:37 -0500 Subject: [PATCH 03/14] Fix links in Dask cuDF documentation (#16929) More follow-up fixes to the recent 
Dask-cuDF documentation additions. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16929 --- docs/dask_cudf/source/best_practices.rst | 15 +++++++++------ docs/dask_cudf/source/conf.py | 1 + docs/dask_cudf/source/index.rst | 11 +++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst index 83039f86fed..41263ebf589 100644 --- a/docs/dask_cudf/source/best_practices.rst +++ b/docs/dask_cudf/source/best_practices.rst @@ -81,7 +81,7 @@ representations, native cuDF spilling may be insufficient. For these cases, `JIT-unspill `__ is likely to produce better protection from out-of-memory (OOM) errors. Please see `Dask-CUDA's spilling documentation -`__ for further details +`__ for further details and guidance. Use RMM @@ -160,7 +160,7 @@ of the underlying task graph to materialize the collection. :func:`sort_values` / :func:`set_index` : These operations both require Dask to eagerly collect quantile information about the column(s) being targeted by the -global sort operation. See `Avoid Sorting`__ for notes on sorting considerations. +global sort operation. See the next section for notes on sorting considerations. .. note:: When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the @@ -297,11 +297,14 @@ bottleneck is typically device-to-host memory spilling. 
Although every workflow is different, the following guidelines are often recommended: -* `Use a distributed cluster with Dask-CUDA workers `_ -* `Use native cuDF spilling whenever possible `_ +* Use a distributed cluster with `Dask-CUDA `__ workers + +* Use native cuDF spilling whenever possible (`Dask-CUDA spilling documentation `__) + * Avoid shuffling whenever possible - * Use ``split_out=1`` for low-cardinality groupby aggregations - * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) + * `Use UCX `__ if communication is a bottleneck. .. note:: diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index dc40254312e..5daa8245695 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -78,6 +78,7 @@ "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), "dask": ("https://docs.dask.org/en/stable/", None), "pandas": ("https://pandas.pydata.org/docs/", None), + "dask-cuda": ("https://docs.rapids.ai/api/dask-cuda/stable/", None), } numpydoc_show_inherited_class_members = True diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 6eb755d7854..c2891ebc15e 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -16,10 +16,9 @@ as the ``"cudf"`` dataframe backend for Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must also deploy a `dask.distributed `__ cluster - to leverage multiple GPUs. We strongly recommend using `Dask-CUDA - `__ to simplify the - setup of the cluster, taking advantage of all features of the GPU - and networking hardware. + to leverage multiple GPUs. 
We strongly recommend using :doc:`dask-cuda:index` + to simplify the setup of the cluster, taking advantage of all features + of the GPU and networking hardware. If you are familiar with Dask and `pandas `__ or `cuDF `__, then Dask cuDF @@ -161,7 +160,7 @@ out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. In order to execute your Dask workflow on multiple GPUs, you will -typically need to use `Dask-CUDA `__ +typically need to use :doc:`dask-cuda:index` to deploy distributed Dask cluster, and `Distributed `__ to define a client object. For example:: @@ -192,7 +191,7 @@ to define a client object. For example:: `__ for more details. -Please see the `Dask-CUDA `__ +Please see the :doc:`dask-cuda:index` documentation for more information about deploying GPU-aware clusters (including `best practices `__). From 40075f1115ecd82a74b46d98e80e19afbf8a0210 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 26 Sep 2024 16:32:09 -0400 Subject: [PATCH 04/14] Use `changed-files` shared workflow (#16713) Contributes to https://github.com/rapidsai/build-planning/issues/94 Depends on https://github.com/rapidsai/shared-workflows/pull/239 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16713 --- .github/workflows/pr.yaml | 140 +++++++++++++++----------------------- 1 file changed, 56 insertions(+), 84 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a65cae34653..bc237cc73b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,80 +43,52 @@ jobs: with: needs: ${{ toJSON(needs) }} changed-files: - runs-on: ubuntu-latest - name: "Check changed files" - outputs: - test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }} - test_java: ${{ 
steps.changed-files.outputs.java_any_changed == 'true' }} - test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} - test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} - test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} - steps: - - name: Get PR info - id: get-pr-info - uses: nv-gha-runners/get-pr-info@main - - name: Checkout code repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 - persist-credentials: false - - name: Calculate merge base - id: calculate-merge-base - env: - PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }} - run: | - (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT" - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v45 - with: - base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }} - sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }} - files_yaml: | - cpp: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - java: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!notebooks/**' - - '!python/**' - - '!ci/cudf_pandas_scripts/**' - notebooks: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!java/**' - - '!ci/cudf_pandas_scripts/**' - python: - - '**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' - - '!ci/cudf_pandas_scripts/**' - cudf_pandas: - - '**' - - 'ci/cudf_pandas_scripts/**' - - '!CONTRIBUTING.md' - - '!README.md' - - '!docs/**' - - '!img/**' - - '!java/**' - - '!notebooks/**' + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - 
'!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + - '!python/**' + test_cudf_pandas: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + test_java: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + test_notebooks: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!java/**' + test_python: + - '**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -139,7 +111,7 @@ jobs: needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_cpp == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: @@ -152,7 +124,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -161,7 +133,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: "ci/test_python_other.sh" @@ -169,7 +141,7 @@ jobs: needs: [conda-cpp-build, changed-files] secrets: inherit uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - if: needs.changed-files.outputs.test_java == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -190,7 +162,7 @@ jobs: needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - if: needs.changed-files.outputs.test_notebooks == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -234,7 +206,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cudf.sh @@ -251,7 +223,7 @@ jobs: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +255,7 @@ jobs: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -303,7 +275,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -314,7 +286,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 - if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) From fa12901024fcc810fcf7f695d2f2e41f472f2306 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:07:48 -0400 Subject: [PATCH 05/14] Fix cudf::strings::findall error with empty input (#16928) Fixes `cudf::strings::findall` error when passed an empty input column. Also adds a gtest for empty input and for all-rows do not match case. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16928 --- cpp/src/strings/search/findall.cu | 10 +++++++--- cpp/tests/strings/findall_tests.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 067a513af96..d8c1b50a94b 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -97,8 +98,11 @@ std::unique_ptr findall(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto const strings_count = input.size(); - auto const d_strings = column_device_view::create(input.parent(), stream); + if (input.is_empty()) { + return cudf::lists::detail::make_empty_lists_column(input.parent().type(), stream, mr); + } + + auto const d_strings = column_device_view::create(input.parent(), stream); // create device object from regex_program auto d_prog = regex_device_builder::create_prog_device(prog, stream); @@ -113,7 +117,7 @@ std::unique_ptr findall(strings_column_view const& input, auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings - return make_lists_column(strings_count, + return make_lists_column(input.size(), std::move(offsets), std::move(strings_output), input.null_count(), diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 47606b9b3ed..6eea1895fb1 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -148,3 +148,31 @@ TEST_F(StringsFindallTests, LargeRegex) LCW expected({LCW{large_regex.c_str()}, LCW{}, LCW{}}); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } + +TEST_F(StringsFindallTests, NoMatches) +{ + cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(^zzz$)"); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}, LCW{}}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::findall(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(StringsFindallTests, EmptyTest) +{ + std::string pattern = R"(\w+)"; + + auto prog = cudf::strings::regex_program::create(pattern); + + cudf::test::strings_column_wrapper input; + auto sv = cudf::strings_column_view(input); + auto results = cudf::strings::findall(sv, *prog); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} From 9125d2f19ecd6a82f29cdb41928737ec73eb491b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 20:07:39 -0500 Subject: [PATCH 06/14] reduce wheel build verbosity, narrow deprecation warning filter (#16896) Proposes some small changes I've taken as follow-ups from previous work here. * #16745 filtered out all linter warnings about uses of `datetime.utcnow()` ... this PR limits that to only the warnings observed from `botocore` (so that the linter will helpfully warn us about such uses directly in `cudf`) - ref https://github.com/rapidsai/cudf/pull/16745#discussion_r1746290952 * reduces the verbosity of logs for wheel builds (`-vvv` to `-v`) - similar to https://github.com/rapidsai/cugraph/pull/4651 ## Notes for Reviewers This is intentionally targeted at `24.12`. No need to rush this into 24.10 before code freeze. ### How I tested this
locally in docker (click me) ```shell docker run \ --rm \ --gpus 1 \ -v $(pwd):/opt/work \ -w /opt/work \ -it rapidsai/citestwheel:latest \ bash pip install \ --prefer-binary \ 'cudf-cu12[test]==24.10.*,>=0.0.0a0' \ 'flask' \ 'flask-cors' \ 'moto>=4.0.8' \ 'boto3' \ 's3fs>=2022.3.0' cd ./python/cudf pytest \ cudf/tests/test_s3.py ```
Authors: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16896 --- ci/build_wheel.sh | 2 +- python/cudf/cudf/tests/pytest.ini | 2 +- python/dask_cudf/pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 7c1fa705faa..bf76f4ed29a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -12,4 +12,4 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index d05ba9aaacc..496a322ff80 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -9,7 +9,7 @@ filterwarnings = ignore:::.*xdist.* ignore:::.*pytest.* # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning + ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index c64de06338f..336b2d24948 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -119,7 +119,7 @@ filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", # some third-party dependencies (e.g. 
'boto3') still using datetime.datetime.utcnow() - "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning", + "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning:botocore", "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning", # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", From 0632538a69f55f6d489d306edf2910a111430425 Mon Sep 17 00:00:00 2001 From: Graham Markall <535640+gmarkall@users.noreply.github.com> Date: Fri, 27 Sep 2024 02:36:13 +0100 Subject: [PATCH 07/14] Use numba-cuda>=0.0.13 (#16474) Testing with https://github.com/NVIDIA/numba-cuda on CI. I am not sure if edits in other repos are required (e.g. I used to have to change an "integration" repo) - any pointers appreciated! Authors: - Graham Markall (https://github.com/gmarkall) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Mike Sarahan (https://github.com/msarahan) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16474 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8db03812a19..8b45d26c367 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git 
a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index fdbe278b66b..354c1360e5a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -53,7 +53,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.57 +- numba-cuda>=0.0.13 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.0.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e22b4a4eddc..25e69b89789 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - - numba >=0.57 + - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index bb8635403a4..ed36a23e5c3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -605,7 +605,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.57 + - &numba-cuda-dep numba-cuda>=0.0.13 - nvtx>=0.2.1 - packaging - rich @@ -720,7 +720,7 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba==0.57.* + - *numba-cuda-dep - pandas==2.0.* - matrix: packages: @@ -802,7 +802,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - dask-cuda==24.12.*,>=0.0.0a0 - - *numba + - *numba-cuda-dep specific: - output_types: [conda, requirements] matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index f90cb96e189..605f9be5a49 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", - "numba>=0.57", + "numba-cuda>=0.0.13", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 336b2d24948..76e47b50c3b 100644 
--- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==24.12.*,>=0.0.0a0", - "numba>=0.57", + "numba-cuda>=0.0.13", "pytest-cov", "pytest-xdist", "pytest<8", From 51e8a3fd446f7ef061c4a5d9aa7ea45f1ac3bab6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 07:24:41 -0700 Subject: [PATCH 08/14] clang-tidy fixes part 1 (#16937) This PR includes a first set of fixes found by applying the latest version of clang-tidy to our code base. To keep things reviewable, I've restricted this PR to a smaller set of changes just to the includes. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16937 --- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/column/column_view.hpp | 4 +- .../cudf/detail/aggregation/aggregation.hpp | 4 +- .../cudf/detail/groupby/sort_helper.hpp | 1 - .../cudf/detail/utilities/host_vector.hpp | 2 +- .../dictionary/dictionary_column_view.hpp | 2 +- cpp/include/cudf/groupby.hpp | 2 +- cpp/include/cudf/io/json.hpp | 3 - cpp/include/cudf/lists/lists_column_view.hpp | 2 +- cpp/include/cudf/scalar/scalar.hpp | 6 +- .../cudf/strings/detail/char_tables.hpp | 4 +- .../cudf/strings/regex/regex_program.hpp | 4 +- cpp/include/cudf/strings/string_view.cuh | 13 ++-- .../cudf/strings/strings_column_view.hpp | 2 +- .../cudf/structs/structs_column_view.hpp | 2 +- .../cudf/tdigest/tdigest_column_view.hpp | 2 +- cpp/include/cudf/utilities/span.hpp | 64 +++++++++---------- 17 files changed, 56 insertions(+), 65 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index c3238cb94fd..35a39ef9758 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1425,13 +1425,13 @@ 
struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ inline auto get_rep(cudf::size_type i) const + __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3ef7bafe727..48f89b8be25 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -235,7 +235,7 @@ class column_view_base { * * @return Typed pointer to underlying data */ - virtual void const* get_data() const noexcept { return _data; } + [[nodiscard]] virtual void const* get_data() const noexcept { return _data; } data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements @@ -695,7 +695,7 @@ class mutable_column_view : public detail::column_view_base { * * @return Typed pointer to underlying data */ - void const* get_data() const noexcept override; + [[nodiscard]] void const* get_data() const noexcept override; private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4255faea702..6661a461b8b 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -683,7 +683,7 @@ class ewma_aggregation final : public scan_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -694,7 +694,7 @@ class ewma_aggregation final : public scan_aggregation { return collector.visit(col_type, *this); } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool 
is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index ce8783d8b79..d7a42d0eca5 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -211,7 +211,6 @@ struct sort_groupby_helper { */ column_view keys_bitmask_column(rmm::cuda_stream_view stream); - private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index ecb8f910463..3f6ad7b7b1d 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -183,7 +183,7 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - bool is_device_accessible() const { return _is_device_accessible; } + [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; } private: rmm::host_async_resource_ref mr; diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index dc822fee38b..5596f78a90b 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -47,7 +47,7 @@ class dictionary_column_view : private column_view { dictionary_column_view(column_view const& dictionary_column); dictionary_column_view(dictionary_column_view&&) = default; ///< Move constructor dictionary_column_view(dictionary_column_view const&) = default; ///< Copy constructor - ~dictionary_column_view() = default; 
+ ~dictionary_column_view() override = default; /** * @brief Move assignment operator diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 11c778408fe..c9df02f167a 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -36,7 +36,7 @@ namespace CUDF_EXPORT cudf { namespace groupby { namespace detail { namespace sort { -class sort_groupby_helper; +struct sort_groupby_helper; } // namespace sort } // namespace detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 6798557e14e..b662b660557 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -116,9 +116,6 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to use the legacy reader - bool _legacy = false; - // Whether to keep the quote characters of string values bool _keep_quotes = false; diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b117a871b64..d7057cfea7e 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -48,7 +48,7 @@ class lists_column_view : private column_view { lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor lists_column_view(lists_column_view const&) = default; ///< Copy constructor - ~lists_column_view() = default; + ~lists_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index e8a498afc09..66be2a12fbe 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { */ class scalar { public: + scalar() = delete; virtual ~scalar() = default; scalar& operator=(scalar const& other) = delete; scalar& operator=(scalar&& other) = delete; @@ -96,8 +97,6 @@ class scalar 
{ data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar rmm::device_scalar _is_valid; ///< Device bool signifying validity - scalar() = delete; - /** * @brief Move constructor for scalar. * @param other The other scalar to move from. @@ -145,6 +144,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; ///< Type of the value held by the scalar. + fixed_width_scalar() = delete; ~fixed_width_scalar() override = default; /** @@ -203,8 +203,6 @@ class fixed_width_scalar : public scalar { protected: rmm::device_scalar _data; ///< device memory containing the value - fixed_width_scalar() = delete; - /** * @brief Construct a new fixed width scalar object. * diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 5d6aff28826..6460d4f43ff 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -74,9 +74,9 @@ character_cases_table_type const* get_character_cases_table(); */ struct special_case_mapping { uint16_t num_upper_chars; - uint16_t upper[3]; + uint16_t upper[3]; // NOLINT uint16_t num_lower_chars; - uint16_t lower[3]; + uint16_t lower[3]; // NOLINT }; /** diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 9da859d9c87..1bf1c26f471 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -54,6 +54,8 @@ struct regex_program { regex_flags flags = regex_flags::DEFAULT, capture_groups capture = capture_groups::EXTRACT); + regex_program() = delete; + /** * @brief Move constructor * @@ -115,8 +117,6 @@ struct regex_program { ~regex_program(); private: - regex_program() = delete; - std::string _pattern; regex_flags _flags; capture_groups _capture; diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 14695c3bb27..34ed3c5618e 100644 --- 
a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -99,7 +99,7 @@ __device__ inline std::pair bytes_to_character_position(st * values. Also, this char pointer serves as valid device pointer of identity * value for minimum operator on string values. */ -static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; +static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; // NOLINT } // namespace detail } // namespace strings @@ -283,14 +283,11 @@ __device__ inline size_type string_view::const_iterator::position() const { retu __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; } -__device__ inline string_view::const_iterator string_view::begin() const -{ - return const_iterator(*this, 0, 0); -} +__device__ inline string_view::const_iterator string_view::begin() const { return {*this, 0, 0}; } __device__ inline string_view::const_iterator string_view::end() const { - return const_iterator(*this, length(), size_bytes()); + return {*this, length(), size_bytes()}; } // @endcond @@ -411,7 +408,7 @@ __device__ inline size_type string_view::find(char const* str, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return find(str, chwidth, pos, count); } @@ -433,7 +430,7 @@ __device__ inline size_type string_view::rfind(char const* str, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const { - char str[sizeof(char_utf8)]; + char str[sizeof(char_utf8)]; // NOLINT size_type chwidth = strings::detail::from_char_utf8(chr, str); return rfind(str, chwidth, pos, count); } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 4a2512eb7c5..6ec8d1238d6 100644 --- 
a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -45,7 +45,7 @@ class strings_column_view : private column_view { strings_column_view(column_view strings_column); strings_column_view(strings_column_view&&) = default; ///< Move constructor strings_column_view(strings_column_view const&) = default; ///< Copy constructor - ~strings_column_view() = default; + ~strings_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 19798f51656..91d7ddce955 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -42,7 +42,7 @@ class structs_column_view : public column_view { // Foundation members: structs_column_view(structs_column_view const&) = default; ///< Copy constructor structs_column_view(structs_column_view&&) = default; ///< Move constructor - ~structs_column_view() = default; + ~structs_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index 2f19efa5630..da4954b859c 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -59,7 +59,7 @@ class tdigest_column_view : private column_view { tdigest_column_view(column_view const&); ///< Construct tdigest_column_view from a column_view tdigest_column_view(tdigest_column_view&&) = default; ///< Move constructor tdigest_column_view(tdigest_column_view const&) = default; ///< Copy constructor - ~tdigest_column_view() = default; + ~tdigest_column_view() override = default; /** * @brief Copy assignment operator * diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0daebc0dd8d..914731ea417 100644 --- a/cpp/include/cudf/utilities/span.hpp 
+++ b/cpp/include/cudf/utilities/span.hpp @@ -236,26 +236,26 @@ struct host_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -264,7 +264,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -274,7 +274,7 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + std::enable_if_t>* = nullptr> // NOLINT constexpr host_span(cudf::detail::host_vector const& in) : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} { @@ -285,7 +285,7 @@ struct host_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept : base(other.data(), other.size()) @@ -333,26 +333,26 @@ struct device_span : public cudf::detail::span_base::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C& in) : 
base(thrust::raw_pointer_cast(in.data()), in.size()) { } /// Constructor from const container /// @param in The container to construct the span from - template < - typename C, - // Only supported containers of types convertible to T - std::enable_if_t::value && - std::is_convertible_v().data()))> (*)[], - T (*)[]>>* = nullptr> + template ::value && + std::is_convertible_v< + std::remove_pointer_t().data()))> (*)[], + T (*)[]>>* = nullptr> // NOLINT constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -362,7 +362,7 @@ struct device_span : public cudf::detail::span_base, + std::is_convertible_v, // NOLINT void>* = nullptr> constexpr device_span(device_span const& other) noexcept : base(other.data(), other.size()) From 1f25d7a24c5d58e6c1acdb3d3fbabc6a5a39ebe6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 09:13:43 -0700 Subject: [PATCH 09/14] clang-tidy fixes part 3 (#16939) Subset of improvements to the code base proposed by the latest version of clang-tidy. **Note to reviewers**: The changeset looks deceptively large. Almost all of the change are really just switching from raw C-style arrays to C++ std::arrays. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/16939 --- cpp/tests/copying/copy_tests.cpp | 174 ++++++++-------- cpp/tests/filling/sequence_tests.cpp | 13 +- cpp/tests/groupby/collect_list_tests.cpp | 7 +- cpp/tests/interop/dlpack_test.cpp | 22 +-- cpp/tests/io/orc_test.cpp | 77 ++++---- cpp/tests/io/parquet_chunked_writer_test.cpp | 72 +++---- cpp/tests/io/parquet_common.cpp | 18 +- cpp/tests/io/parquet_misc_test.cpp | 9 +- cpp/tests/io/parquet_reader_test.cpp | 187 +++++++++--------- cpp/tests/io/parquet_v2_test.cpp | 44 +++-- cpp/tests/io/parquet_writer_test.cpp | 88 ++++----- cpp/tests/json/json_tests.cpp | 12 +- cpp/tests/reductions/reduction_tests.cpp | 7 +- cpp/tests/reductions/scan_tests.cpp | 4 +- cpp/tests/rolling/nth_element_test.cpp | 4 +- cpp/tests/streams/transform_test.cpp | 2 +- cpp/tests/strings/chars_types_tests.cpp | 29 +-- cpp/tests/strings/contains_tests.cpp | 69 +++---- cpp/tests/strings/durations_tests.cpp | 143 +++++++------- cpp/tests/strings/extract_tests.cpp | 6 +- cpp/tests/strings/findall_tests.cpp | 6 +- .../integration/unary_transform_test.cpp | 2 +- 22 files changed, 483 insertions(+), 512 deletions(-) diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 7c8729b6a77..4124f749012 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -73,44 +73,45 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) using T = TypeParam; // make sure we span at least 2 warps - int num_els = 64; - - bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, false, false, false, false, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - 
true, true, true, true, true, true, true, true, true, true, true, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - - bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - lhs_v); - - bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, - rhs_v); - - bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, true}; - wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, - exp_v); + constexpr int num_els = 64; + + std::array mask{ + true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); + + wrapper lhs_w( + {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper rhs_w( + {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); + + wrapper expected_w( + {5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -318,19 +319,17 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); auto const rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v.begin()); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), rhs_v); + wrapper expected_w(expected.begin(), expected.end(), rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -340,20 +339,18 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - bool mask_v[] = {true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); + std::array mask{true, false, false, true}; + std::array mask_v{true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), 
mask_v.begin()); auto const lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v.begin()); cudf::numeric_scalar rhs_w(6); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 6}); - wrapper expected_w(expected.begin(), expected.end(), lhs_v); + wrapper expected_w(expected.begin(), expected.end(), lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -363,16 +360,14 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); cudf::numeric_scalar lhs_w(5); cudf::numeric_scalar rhs_w(6, false); auto const expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); - wrapper expected_w(expected.begin(), expected.end(), mask); + wrapper expected_w(expected.begin(), expected.end(), mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -405,17 +400,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {true, false, true, true}; - wrapper rhs_w({6, 6, 6, 6}, rhs_v); + std::array rhs_v{true, false, true, true}; + wrapper rhs_w({6, 6, 6, 6}, rhs_v.begin()); - wrapper expected_w({5, 
6, 6, 5}, rhs_v); + wrapper expected_w({5, 6, 6, 5}, rhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -425,17 +418,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); - bool lhs_v[] = {false, true, true, true}; - wrapper lhs_w({5, 5, 5, 5}, lhs_v); + std::array lhs_v{false, true, true, true}; + wrapper lhs_w({5, 5, 5, 5}, lhs_v.begin()); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); - wrapper expected_w({5, 6, 6, 5}, lhs_v); + wrapper expected_w({5, 6, 6, 5}, lhs_v.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -445,15 +436,13 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) { using T = TypeParam; - int num_els = 4; - - bool mask[] = {true, false, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); + std::array mask{true, false, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), false); - wrapper expected_w({5, 6, 6, 5}, mask); + wrapper expected_w({5, 6, 6, 5}, mask.begin()); auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); @@ -483,9 +472,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool 
mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -510,9 +499,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, false, true, false, true, false}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, false, true, false, true, false}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -538,8 +527,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -565,9 +554,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) std::vector h_string2{"aaa"}; cudf::string_scalar string2{h_string2[0], false}; - constexpr cudf::size_type mask_size = 6; - bool mask[] = {true, false, true, false, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); + std::array mask{true, false, true, false, true, false}; 
+ cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(string1, string2, mask_w); @@ -652,9 +640,9 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {true, true, false, true, false, true}; - bool mask_v[] = {true, true, true, true, true, false}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + std::array mask{true, true, false, true, false, true}; + std::array mask_v{true, true, true, true, true, false}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end(), mask_v.begin()); auto results = cudf::copy_if_else(input1, input2, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); @@ -679,8 +667,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {false, true, true, true, false, true}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + std::array mask{false, true, true, true, false, true}; + cudf::test::fixed_width_column_wrapper mask_w(mask.begin(), mask.end()); auto results = cudf::copy_if_else(input2, input1, mask_w); auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 5651a26f192..0783b4e5bbb 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -41,8 +41,7 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init, step); @@ -58,8 +57,8 @@ 
TYPED_TEST(SequenceTypedTestFixture, Decrementing) cudf::size_type num_els = 10; - T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w( + {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}); auto result = cudf::sequence(num_els, init, step); @@ -75,8 +74,7 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) cudf::size_type num_els = 0; - T expected[] = {}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({}); auto result = cudf::sequence(num_els, init, step); @@ -121,8 +119,7 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) cudf::size_type num_els = 10; - T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto result = cudf::sequence(num_els, init); diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 749f4013013..a79b6a32916 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -127,8 +127,9 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) using LCW = cudf::test::lists_column_wrapper; cudf::test::fixed_width_column_wrapper keys{1, 1, 2, 2, 3, 3, 4, 4}; - bool const validity_mask[] = {true, false, false, true, true, true, false, false}; - LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, validity_mask}; + std::array const validity_mask{true, false, false, true, true, true, false, false}; + LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, + validity_mask.data()}; cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 330f07ac8e2..ef4b9dd9b8a 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -225,8 +225,8 @@ TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack) constexpr int ndim = 1; // Broadcasted (stride-0) 1D tensor auto const data = cudf::test::make_type_param_vector({1}); - int64_t shape[ndim] = {5}; - int64_t strides[ndim] = {0}; + int64_t shape[ndim] = {5}; // NOLINT + int64_t strides[ndim] = {0}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -248,8 +248,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack) constexpr int ndim = 1; // Strided 1D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2}; - int64_t strides[ndim] = {2}; + int64_t shape[ndim] = {2}; // NOLINT + int64_t strides[ndim] = {2}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -271,7 +271,7 @@ TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; + int64_t shape[ndim] = {2, 2}; // NOLINT DLManagedTensor tensor{}; 
tensor.dl_tensor.device.device_type = kDLCPU; @@ -293,8 +293,8 @@ TEST_F(DLPackUntypedTests, UnsupportedExplicitRowMajor2DTensorFromDlpack) constexpr int ndim = 2; // Row major 2D tensor with explicit strides auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 1}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 1}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -316,8 +316,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack) constexpr int ndim = 2; // Column major, but strided in fastest dimension auto const data = cudf::test::make_type_param_vector({1, 2, 3, 4, 5, 6, 7, 8}); - int64_t shape[ndim] = {2, 2}; - int64_t strides[ndim] = {2, 4}; + int64_t shape[ndim] = {2, 2}; // NOLINT + int64_t strides[ndim] = {2, 4}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; @@ -465,8 +465,8 @@ TYPED_TEST(DLPackNumericTests, FromDlpackCpu) using T = TypeParam; auto const data = cudf::test::make_type_param_vector({0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 0}); uint64_t const offset{sizeof(T)}; - int64_t shape[2] = {4, 2}; - int64_t strides[2] = {1, 5}; + int64_t shape[2] = {4, 2}; // NOLINT + int64_t strides[2] = {1, 5}; // NOLINT DLManagedTensor tensor{}; tensor.dl_tensor.device.device_type = kDLCPU; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 39ba62952b4..89e704f3ed3 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -38,6 +38,7 @@ #include +#include #include template @@ -767,14 +768,14 @@ TEST_F(OrcChunkedWriterTest, Metadata) TEST_F(OrcChunkedWriterTest, Strings) { - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); + 
str_col strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); table_view tbl1({strings1}); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); + str_col strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -877,26 +878,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els{31}; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), 
static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -920,26 +921,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); table_view tbl1({c1a_w, c1b_w}); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -1140,7 +1141,7 @@ 
TEST_F(OrcReaderTest, zstdCompressionRegression) } // Test with zstd compressed orc file with high compression ratio. - constexpr uint8_t input_buffer[] = { + constexpr std::array input_buffer{ 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, @@ -1154,7 +1155,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression) 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; auto source = - cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); + cudf::io::source_info(reinterpret_cast(input_buffer.data()), input_buffer.size()); cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(source).use_index(false); diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp index 282c6f3adad..810fee89c48 100644 --- a/cpp/tests/io/parquet_chunked_writer_test.cpp +++ b/cpp/tests/io/parquet_chunked_writer_test.cpp @@ -124,15 +124,15 @@ TEST_F(ParquetChunkedWriterTest, Strings) { std::vector> cols; - bool mask1[] = {true, true, false, true, true, true, true}; + std::array mask1{true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); + cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1.data()); cols.push_back(strings1.release()); cudf::table tbl1(std::move(cols)); - bool mask2[] = {false, true, true, true, true, true, true}; + std::array mask2{false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); 
+ cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2.data()); cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); @@ -771,29 +771,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; - int num_els = 31; + constexpr int num_els = 31; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + true, true, true, true, true, true, true, true, true}; + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); @@ -819,29 +819,29 @@ 
TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; - int num_els = 33; + constexpr int num_els = 33; std::vector> cols; - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; + std::array mask{false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); + std::array c1a; + std::fill(c1a.begin(), c1a.end(), static_cast(5)); + std::array c1b; + std::fill(c1b.begin(), c1b.end(), static_cast(5)); + column_wrapper c1a_w(c1a.begin(), c1a.end(), mask.begin()); + column_wrapper c1b_w(c1b.begin(), c1b.end(), mask.begin()); cols.push_back(c1a_w.release()); cols.push_back(c1b_w.release()); cudf::table tbl1(std::move(cols)); - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); + std::array c2a; + std::fill(c2a.begin(), c2a.end(), static_cast(8)); + std::array c2b; + std::fill(c2b.begin(), c2b.end(), static_cast(9)); + column_wrapper c2a_w(c2a.begin(), c2a.end(), mask.begin()); + column_wrapper c2b_w(c2b.begin(), c2b.end(), mask.begin()); cols.push_back(c2a_w.release()); cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 3dd5ad145ea..6141a40bc95 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ 
-483,10 +483,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> ascending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", i); - return std::string(buf); + sprintf(buf.data(), "%09d", i); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -495,10 +495,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> descending() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", num_ordered_rows - i); - return std::string(buf); + sprintf(buf.data(), "%09d", static_cast(num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -507,10 +507,10 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> unordered() { - char buf[10]; + std::array buf; auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); - return std::string(buf); + sprintf(buf.data(), "%09d", (i % 2 == 0) ? 
i : (num_ordered_rows - i)); + return std::string(buf.data()); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 01027d04658..8b03e94191e 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -23,6 +23,8 @@ #include #include +#include + //////////////////////////////// // delta encoding writer tests @@ -225,10 +227,9 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { - cudf::io::parquet::detail::BoundaryOrder::ASCENDING, - cudf::io::parquet::detail::BoundaryOrder::DESCENDING, - cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; + std::array expected_orders{cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 6c61535359f..dc8e68b3a15 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + TEST_F(ParquetReaderTest, UserBounds) { // trying to read more rows than there are should result in @@ -569,7 +571,8 @@ TEST_F(ParquetReaderTest, DecimalRead) This test is a temporary test until python gains the ability to write decimal, so we're embedding a parquet file directly into the code here to prevent issues with finding the file */ - unsigned char const decimals_parquet[] = { + constexpr unsigned int decimals_parquet_len = 2366; + std::array const decimals_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 
0xb8, 0x03, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, @@ -728,10 +731,10 @@ TEST_F(ParquetReaderTest, DecimalRead) 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - unsigned int decimals_parquet_len = 2366; - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(decimals_parquet.data()), decimals_parquet_len}); auto result = cudf::io::read_parquet(read_opts); auto validity = @@ -739,7 +742,7 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_EQ(result.tbl->view().num_columns(), 3); - int32_t col0_data[] = { + std::array col0_data{ -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, @@ -753,29 +756,28 @@ TEST_F(ParquetReaderTest, DecimalRead) std::begin(col0_data), std::end(col0_data), validity, numeric::scale_type{-4}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, - 68073792696254, 2236456014294, 13704555677045, - -70797090469548, -52248605513407, -68976081919961, - -34277313883112, 97774730521689, 21184241014572, - -670882460254, -40862944054399, -24079852370612, - -88670167797498, -84007574359403, -71843004533519, - -55538016554201, 3491435293032, -29085437167297, - 
36901882672273, -98622066122568, -13974902998457, - 86712597643378, -16835133643735, -94759096142232, - 30708340810940, 79086853262082, 78923696440892, - -76316597208589, 37247268714759, 80303592631774, - 57790350050889, 19387319851064, -33186875066145, - 69701203023404, -7157433049060, -7073790423437, - 92769171617714, -75127120182184, -951893180618, - 64927618310150, -53875897154023, -16168039035569, - -24273449166429, -30359781249192, 35639397345991, - 45844829680593, 71401416837149, 0, - -99999999999999, 99999999999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + std::array col1_data{29274040266581, -17210335917753, -58420730139037, + 68073792696254, 2236456014294, 13704555677045, + -70797090469548, -52248605513407, -68976081919961, + -34277313883112, 97774730521689, 21184241014572, + -670882460254, -40862944054399, -24079852370612, + -88670167797498, -84007574359403, -71843004533519, + -55538016554201, 3491435293032, -29085437167297, + 36901882672273, -98622066122568, -13974902998457, + 86712597643378, -16835133643735, -94759096142232, + 30708340810940, 79086853262082, 78923696440892, + -76316597208589, 37247268714759, 80303592631774, + 57790350050889, 19387319851064, -33186875066145, + 69701203023404, -7157433049060, -7073790423437, + 92769171617714, -75127120182184, -951893180618, + 64927618310150, -53875897154023, -16168039035569, + -24273449166429, -30359781249192, 35639397345991, + 45844829680593, 71401416837149, 0, + -99999999999999, 99999999999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); + col1_data.begin(), col1_data.end(), validity, numeric::scale_type{-5}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); cudf::io::parquet_reader_options read_strict_opts = read_opts; @@ -786,7 +788,7 @@ 
TEST_F(ParquetReaderTest, DecimalRead) // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4) // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) - unsigned char const fixed_len_bytes_decimal_parquet[] = { + std::array const fixed_len_bytes_decimal_parquet{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, @@ -875,75 +877,72 @@ TEST_F(ParquetReaderTest, DecimalRead) cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ - reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); + reinterpret_cast(fixed_len_bytes_decimal_parquet.data()), parquet_len}); auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().num_columns(), 3); - auto validity_c0 = cudf::test::iterators::nulls_at({19}); - int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, - 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, - 2282579, 7521455, 4430706, 1937859, 4532040, 0}; + auto validity_c0 = cudf::test::iterators::nulls_at({19}); + std::array col0_data{6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, + 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, + 2282579, 7521455, 4430706, 1937859, 4532040, 0}; - EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); + EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), col0_data.size()); cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); + col0_data.begin(), col0_data.end(), validity_c0, 
numeric::scale_type{-3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - auto validity_c1 = cudf::test::iterators::nulls_at({18}); - int64_t col1_data[] = {361378026250, - 30646804862, - 429930238629, - 418758703536, - 895494171113, - 435283865083, - 809096053722, - -999999999999, - 426465099333, - 526684574144, - 826310892810, - 584686967589, - 113822282951, - 409236212092, - 420631167535, - 918438386086, - -999999999999, - 489053889147, - 0, - 363993164092}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); + auto validity_c1 = cudf::test::iterators::nulls_at({18}); + std::array col1_data{361378026250, + 30646804862, + 429930238629, + 418758703536, + 895494171113, + 435283865083, + 809096053722, + -999999999999, + 426465099333, + 526684574144, + 826310892810, + 584686967589, + 113822282951, + 409236212092, + 420631167535, + 918438386086, + -999999999999, + 489053889147, + 0, + 363993164092}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), col1_data.size()); cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); + col1_data.begin(), col1_data.end(), validity_c1, numeric::scale_type{-11}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); - __int128_t col2_data[] = {9078697037144433659, - 9050770539577117612, - 2358363961733893636, - 1566059559232276662, - 6658306200002735268, - 4967909073046397334, - 0, - 7235588493887532473, - 5023160741463849572, - 2765173712965988273, - 3880866513515749646, - 5019704400576359500, - 5544435986818825655, - 7265381725809874549, - 0, - 1576192427381240677, - 2828305195087094598, - 260308667809395171, - 2460080200895288476, - 2718441925197820439}; - - EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), - sizeof(col2_data) / sizeof(col2_data[0])); + auto validity_c2 = 
cudf::test::iterators::nulls_at({6, 14}); + std::array<__int128_t, 20> col2_data{9078697037144433659, + 9050770539577117612, + 2358363961733893636, + 1566059559232276662, + 6658306200002735268, + 4967909073046397334, + 0, + 7235588493887532473, + 5023160741463849572, + 2765173712965988273, + 3880866513515749646, + 5019704400576359500, + 5544435986818825655, + 7265381725809874549, + 0, + 1576192427381240677, + 2828305195087094598, + 260308667809395171, + 2460080200895288476, + 2718441925197820439}; + + EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), col2_data.size()); cudf::test::fixed_point_column_wrapper<__int128_t> col2( - std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); + col2_data.begin(), col2_data.end(), validity_c2, numeric::scale_type{-1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); } } @@ -1221,7 +1220,7 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) TEST_F(ParquetReaderTest, SingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1239,7 +1238,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) // read single level list reproducing parquet file cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()}); auto table = cudf::io::read_parquet(read_opts); auto const c0 = table.tbl->get_column(0); @@ -1252,7 +1251,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists) TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) { - unsigned char list_bytes[] = { + std::array list_bytes{ 0x50, 0x41, 0x52, 0x31, 
0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, @@ -1271,7 +1270,7 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) auto reader = cudf::io::chunked_parquet_reader( 1L << 31, cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); + cudf::io::source_info{reinterpret_cast(list_bytes.data()), list_bytes.size()})); int iterations = 0; while (reader.has_next() && iterations < 10) { auto chunk = reader.read_chunk(); @@ -1932,7 +1931,7 @@ TEST_F(ParquetReaderTest, FilterFloatNAN) TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { - constexpr unsigned char repeated_bytes[] = { + constexpr std::array repeated_bytes{ 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, @@ -1976,9 +1975,9 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations) 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - auto read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); - auto result = cudf::io::read_parquet(read_opts); + auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(repeated_bytes.data()), repeated_bytes.size()}); + auto result = cudf::io::read_parquet(read_opts); EXPECT_EQ(result.tbl->view().column(0).size(), 6); EXPECT_EQ(result.tbl->view().num_columns(), 2); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 
9e66fc9409f..7c305235ea6 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +#include + using cudf::test::iterators::no_nulls; // Base test fixture for V2 header tests @@ -693,9 +695,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -715,9 +717,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -787,9 +789,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -819,9 +821,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); @@ -897,9 +899,9 @@ TEST_P(ParquetV2Test, 
CheckColumnOffsetIndexNullColumn) // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%012d", i); + return std::string(buf.data()); }); auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); @@ -914,9 +916,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) // mixed length strings auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); + std::array buf; + sprintf(buf.data(), "%d", i); + return std::string(buf.data()); }); auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); @@ -1034,7 +1036,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) // hard coded schema indices. // TODO find a way to do this without magic - size_t const colidxs[] = {1, 3, 4, 5, 8}; + constexpr std::array colidxs{1, 3, 4, 5, 8}; for (size_t r = 0; r < fmd.row_groups.size(); r++) { auto const& rg = fmd.row_groups[r]; for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1129,7 +1131,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) // col1 will have num_ordered_rows / 2 nulls total // col2 will have num_ordered_rows / 3 nulls total // col3 will have num_ordered_rows / 4 nulls total - int const null_mods[] = {0, 2, 3, 4}; + constexpr std::array null_mods{0, 2, 3, 4}; for (auto const& rg : fmd.row_groups) { for (size_t c = 0; c < rg.columns.size(); c++) { @@ -1299,7 +1301,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); - int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; + std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; std::vector const expected_def_hists[] = {{1, 1, 2, 3}, {1, 3, 10}, {1, 1, 2, 10}, diff --git 
a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index c8100038942..8794f2ee304 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -31,6 +31,7 @@ #include #include +#include #include using cudf::test::iterators::no_nulls; @@ -879,53 +880,52 @@ TEST_F(ParquetWriterTest, Decimal128Stats) TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { - char const* coldata[] = { - // in-range 7 bit. should truncate to "yyyyyyyz" - "yyyyyyyyy", - // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's - // considered binary, not UTF-8. If UTF-8 it should not truncate. - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - // max binary. this should not truncate - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" - "ééééé", - // max 2-byte UTF8 (U+07FF). should not truncate - "߿߿߿߿߿", - // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" - "ࠀࠀࠀ", - // max 3-byte UTF8 (U+FFFF). should not truncate - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" - "𐀀𐀀𐀀", - // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, - // which is no longer valid unicode, but is still ok UTF-8??? - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - // max 4-byte UTF8 (U+1FFFFF). should not truncate - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array coldata{// in-range 7 bit. should truncate to "yyyyyyyz" + "yyyyyyyyy", + // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's + // considered binary, not UTF-8. If UTF-8 it should not truncate. + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + // max binary. this should not truncate + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" + "ééééé", + // max 2-byte UTF8 (U+07FF). should not truncate + "߿߿߿߿߿", + // in-range 3-byte UTF8 (U+0800). 
should truncate to "ࠀࠁ" + "ࠀࠀࠀ", + // max 3-byte UTF8 (U+FFFF). should not truncate + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" + "𐀀𐀀𐀀", + // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, + // which is no longer valid unicode, but is still ok UTF-8??? + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + // max 4-byte UTF8 (U+1FFFFF). should not truncate + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger // than that will not become minimum value (when written as UTF-8). - char const* truncated_min[] = {"yyyyyyyy", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - "\xf7\xbf\xbf\xbf", - "éééé", - "߿߿߿߿", - "ࠀࠀ", - "\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀀", - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - "\xf7\xbf\xbf\xbf"}; - - char const* truncated_max[] = {"yyyyyyyz", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - "éééê", - "߿߿߿߿߿", - "ࠀࠁ", - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀁", - "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + std::array truncated_min{"yyyyyyyy", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + "\xf7\xbf\xbf\xbf", + "éééé", + "߿߿߿߿", + "ࠀࠀ", + "\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀀", + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + "\xf7\xbf\xbf\xbf"}; + + std::array truncated_max{"yyyyyyyz", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "éééê", + "߿߿߿߿߿", + "ࠀࠁ", + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀁", + "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; auto cols = [&]() { using string_wrapper = column_wrapper; diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index a9186874e83..42a574ac5c0 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -652,7 +652,7 @@ TEST_F(JsonPathTests, MixedOutput) // 
various queries on: // clang-format off std::vector input_strings { - "{\"a\": {\"b\" : \"c\"}}", + R"({"a": {"b" : "c"}})", "{" "\"a\": {\"b\" : \"c\"}," @@ -827,7 +827,7 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // various queries on: std::vector input_strings{ // clang-format off - "{\'a\': {\'b\' : \'c\'}}", + R"({'a': {'b' : 'c'}})", "{" "\'a\': {\'b\' : \"c\"}," @@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"item\" : [{\"key\" : \"value[\"}]}", + R"({"item" : [{"key" : "value["}]})", // clang-format on }; @@ -927,7 +927,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::vector input_strings{ // clang-format off - "{\"a\" : \"[}{}][][{[\\\"}}[\\\"]\"}", + R"({"a" : "[}{}][][{[\"}}[\"]"})", // clang-format on }; @@ -958,8 +958,8 @@ TEST_F(JsonPathTests, EscapeSequences) std::vector input_strings{ // clang-format off - "{\"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\"}", - "{\"a\" : \"\\u1248 \\uacdf \\uACDF \\u10EF\"}" + R"({"a" : "\" \\ \/ \b \f \n \r \t"})", + R"({"a" : "\u1248 \uacdf \uACDF \u10EF"})" // clang-format on }; diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 949ffcc26a6..1e9e13ded93 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -35,7 +35,6 @@ #include -#include #include using aggregation = cudf::aggregation; @@ -1254,7 +1253,7 @@ struct StringReductionTest : public cudf::test::BaseFixture, }; // ------------------------------------------------------------------------ -std::vector string_list[] = { +std::vector> string_list{{ {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"", "two", "three", "four", "five", "six", "seven", "eight", "nine"}, {"one", "", "three", "four", "five", "six", "seven", "eight", "nine"}, @@ -1264,7 +1263,7 @@ std::vector string_list[] = { {"\xF7\xBF\xBF\xBF", "", "", "", "", "", "", "", ""}, {"one", 
"two", "three", "four", "\xF7\xBF\xBF\xBF", "six", "seven", "eight", "nine"}, {"one", "two", "\xF7\xBF\xBF\xBF", "four", "five", "six", "seven", "eight", "nine"}, -}; +}}; INSTANTIATE_TEST_CASE_P(string_cases, StringReductionTest, testing::ValuesIn(string_list)); TEST_P(StringReductionTest, MinMax) { @@ -2235,7 +2234,7 @@ TYPED_TEST(ReductionTest, NthElement) struct DictionaryStringReductionTest : public StringReductionTest {}; -std::vector data_list[] = { +std::vector> data_list = { {"nine", "two", "five", "three", "five", "six", "two", "eight", "nine"}, }; INSTANTIATE_TEST_CASE_P(dictionary_cases, diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 76dbbaef491..c4463d68a68 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -415,8 +415,8 @@ TEST_F(ScanStringsTest, MoreStringsMinMax) int row_count = 512; auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) { - char const s[] = {static_cast('a' + (idx % 26)), 0}; - return std::string(s); + char const s = static_cast('a' + (idx % 26)); + return std::string{1, s}; }); auto validity = cudf::detail::make_counting_transform_iterator( 0, [](auto idx) -> bool { return (idx % 23) != 22; }); diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 9cc8b6dec81..2444992e68f 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -83,7 +83,7 @@ class rolling_exec { return *this; } - std::unique_ptr test_grouped_nth_element( + [[nodiscard]] std::unique_ptr test_grouped_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) const { return cudf::grouped_rolling_window( @@ -96,7 +96,7 @@ class rolling_exec { n, null_handling.value_or(_null_handling))); } - std::unique_ptr test_nth_element( + [[nodiscard]] std::unique_ptr test_nth_element( cudf::size_type n, std::optional null_handling = std::nullopt) const { return 
cudf::rolling_window(_input, diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index 9187672221c..cf81dc6fb42 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -32,7 +32,7 @@ class TransformTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 7e530b2a34d..5923f8dee5a 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsCharsTest : public cudf::test::BaseFixture {}; @@ -50,20 +51,20 @@ TEST_P(CharsTypes, AllTypes) "de", "\t\r\n\f "}; - bool expecteds[] = {false, false, false, false, false, false, false, false, - false, false, false, false, false, true, false, false, // decimal - false, false, false, false, false, false, false, false, - false, true, false, true, false, true, false, false, // numeric - false, false, false, false, false, false, false, false, - false, false, false, true, false, true, false, false, // digit - true, true, false, true, false, false, false, false, - false, false, false, false, false, false, true, false, // alpha - false, false, false, false, false, false, false, false, - false, false, false, false, false, false, false, true, // space - false, false, false, true, false, false, false, false, - false, false, false, false, false, false, false, false, // upper - false, true, false, false, false, false, false, false, - false, false, false, false, false, false, true, false}; // lower + std::array expecteds{false, false, 
false, false, false, false, false, false, + false, false, false, false, false, true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, true, false, false, false, false, + false, false, false, false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index acf850c7a66..bdfd38267e6 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -32,6 +32,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture {}; @@ -167,10 +168,8 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { auto const pattern = std::string("lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, true, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -178,10 +177,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; 
cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, true, true, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -189,10 +186,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string("@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {false, false, false, false, false, false, false}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -200,10 +195,8 @@ TEST_F(StringsContainsTests, MatchesTest) } { auto const pattern = std::string(".*"); - bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), + {true, true, true, true, true, false, true}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::matches_re(strings_view, *prog); @@ -335,9 +328,9 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); std::generate(data.begin(), data.end(), [n = 0]() mutable { - char first = static_cast('A' + n++); - char raw_data[] = {first, '\0', 'B'}; - return std::string{raw_data, 3}; + char first = static_cast('A' + n++); + std::array raw_data = {first, '\0', 'B'}; + return std::string{raw_data.data(), 3}; }); cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); @@ -749,11 +742,11 @@ 
TEST_F(StringsContainsTests, ASCII) auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé  ❽❽", "aZ ❽4", "XYZ 8"}); auto view = cudf::strings_column_view(input); - std::string patterns[] = {R"(\w+[\s]+\d+)", - R"([^\W]+\s+[^\D]+)", - R"([\w]+[^\S]+[\d]+)", - R"([\w]+\s+[\d]+)", - R"(\w+\s+\d+)"}; + std::array patterns = {R"(\w+[\s]+\d+)", + R"([^\W]+\s+[^\D]+)", + R"([\w]+[^\S]+[\d]+)", + R"([\w]+\s+[\d]+)", + R"(\w+\s+\d+)"}; for (auto ptn : patterns) { auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); @@ -787,24 +780,18 @@ TEST_F(StringsContainsTests, MediumRegex) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } @@ -828,24 +815,18 @@ TEST_F(StringsContainsTests, LargeRegex) auto strings_view = cudf::strings_column_view(strings); { - auto 
results = cudf::strings::contains_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::contains_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, *prog); - bool h_expected[] = {true, false, false}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::matches_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, *prog); - int32_t h_expected[] = {1, 0, 0}; - cudf::test::fixed_width_column_wrapper expected(h_expected, - h_expected + h_strings.size()); + auto results = cudf::strings::count_re(strings_view, *prog); + cudf::test::fixed_width_column_wrapper expected({1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } } diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp index 86189b29981..f2e31339035 100644 --- a/cpp/tests/strings/durations_tests.cpp +++ b/cpp/tests/strings/durations_tests.cpp @@ -24,6 +24,7 @@ #include +#include #include struct StringsDurationsTest : public cudf::test::BaseFixture {}; @@ -403,17 +404,17 @@ TEST_F(StringsDurationsTest, ParseSingle) "01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; - auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 3600}; }); + std::array expected_v{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0}; + auto it1 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return 
cudf::duration_s{i * 3600}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 60}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_s{i * 60}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -421,14 +422,14 @@ TEST_F(StringsDurationsTest, ParseSingle) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s2); auto it3 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s3(it3, it3 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s3); - auto it4 = thrust::make_transform_iterator(expected_v, + auto it4 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i * 60000}; }); cudf::test::fixed_width_column_wrapper expected_ms(it4, it4 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -454,21 +455,21 @@ TEST_F(StringsDurationsTest, ParseMultiple) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0, - -1, - -(3600 + 60 + 1), - 23 * 3600 + 1, - -(23 * 3600 + 1), - 59 * 3600, - -59 * 3600, - 99 * 3600, - -99 * 3600, - 0, - 3661, - 0}; + std::array expected_v{0, + 0, + -1, + -(3600 + 60 + 1), + 23 * 3600 + 1, + -(23 * 3600 + 1), + 59 * 3600, + -59 * 
3600, + 99 * 3600, + -99 * 3600, + 0, + 3661, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -476,7 +477,7 @@ TEST_F(StringsDurationsTest, ParseMultiple) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -508,28 +509,28 @@ TEST_F(StringsDurationsTest, ParseSubsecond) "01:01:01", ""}; // error auto size = cudf::column_view(string_src).size(); - int64_t expected_v[]{0, - -123456789L, - -1000666999L, - -((3600 + 60 + 1) * 1000000000L + 100000000L), - (23 * 3600 + 1) * 1000000000L + 80L, - -((23 * 3600 + 1) * 1000000000L + 123000000L), - (59 * 3600) * 1000000000L, - -(59 * 3600) * 1000000000L, - (99 * 3600) * 1000000000L, - -(99 * 3600) * 1000000000L, - 0, - (3661) * 1000000000L, - 0}; + std::array expected_v{0, + -123456789L, + -1000666999L, + -((3600 + 60 + 1) * 1000000000L + 100000000L), + (23 * 3600 + 1) * 1000000000L + 80L, + -((23 * 3600 + 1) * 1000000000L + 123000000L), + (59 * 3600) * 1000000000L, + -(59 * 3600) * 1000000000L, + (99 * 3600) * 1000000000L, + -(99 * 3600) * 1000000000L, + 0, + (3661) * 1000000000L, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ns{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ns{i}; }); 
cudf::test::fixed_width_column_wrapper expected_ns1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), "%H:%M:%S"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns1); - auto it2 = thrust::make_transform_iterator(expected_v, + auto it2 = thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ms{i / 1000000}; }); cudf::test::fixed_width_column_wrapper expected_ms2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), @@ -559,25 +560,25 @@ TEST_F(StringsDurationsTest, ParseAMPM) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 0, - 0 - 12 * 3600, - -1, - -1 - 12 * 3600, - -(3600 + 60 + 1), - -(3600 + 60 + 1) - 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - -(11 * 3600 + 59 * 60 + 59), - -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), - 0, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 0, + 0 - 12 * 3600, + -1, + -1 - 12 * 3600, + -(3600 + 60 + 1), + -(3600 + 60 + 1) - 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + -(11 * 3600 + 59 * 60 + 59), + -(11 * 3600 + 59 * 60 + 59 + 12 * 3600), + 0, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -585,7 +586,7 @@ TEST_F(StringsDurationsTest, ParseAMPM) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); auto it2 = thrust::make_transform_iterator( - expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; }); + expected_v.data(), [](auto i) 
{ return cudf::duration_D{i / (24 * 3600)}; }); cudf::test::fixed_width_column_wrapper expected_D2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -616,20 +617,20 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "01:01:01", // error ""}; // error auto size = cudf::column_view(string_src).size(); - int32_t expected_v[]{0, - 0 + 12 * 3600, - 1, - 1 + 12 * 3600, - (3600 + 60 + 1), - (3600 + 60 + 1) + 12 * 3600, - 11 * 3600 + 59 * 60 + 59, - 11 * 3600 + 59 * 60 + 59 + 12 * 3600, - 0, - 0, - 0, - 0}; + std::array expected_v{0, + 0 + 12 * 3600, + 1, + 1 + 12 * 3600, + (3600 + 60 + 1), + (3600 + 60 + 1) + 12 * 3600, + 11 * 3600 + 59 * 60 + 59, + 11 * 3600 + 59 * 60 + 59 + 12 * 3600, + 0, + 0, + 0, + 0}; auto it1 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; }); + thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; }); cudf::test::fixed_width_column_wrapper expected_s1(it1, it1 + size); auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), @@ -641,8 +642,8 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier) "%OI:%OM:%OS %p"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1); - auto it2 = - thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ms{i * 1000}; }); + auto it2 = thrust::make_transform_iterator(expected_v.data(), + [](auto i) { return cudf::duration_ms{i * 1000}; }); cudf::test::fixed_width_column_wrapper expected_s2(it2, it2 + size); results = cudf::strings::to_durations(cudf::strings_column_view(string_src), cudf::data_type(cudf::type_to_id()), diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 1491da758d5..61246fb098d 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -275,8 +275,8 @@ 
TEST_F(StringsExtractTests, ExtractAllTest) auto pattern = std::string("(\\d+) (\\w+)"); - bool valids[] = {true, true, true, false, false, false, true}; - using LCW = cudf::test::lists_column_wrapper; + std::array valids{true, true, true, false, false, false, true}; + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, LCW{"6", "péar", "0", "pair"}, @@ -284,7 +284,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{}, LCW{"4", "paré"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::extract_all_record(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 6eea1895fb1..73da4d081e2 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -33,10 +33,10 @@ struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) { - bool valids[] = {true, true, true, true, true, false, true, true}; + std::array valids{true, true, true, true, true, false, true, true}; cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, - valids); + valids.data()); auto sv = cudf::strings_column_view(input); auto pattern = std::string("(\\d+)-(\\w+)"); @@ -50,7 +50,7 @@ TEST_F(StringsFindallTests, FindallTest) LCW{}, LCW{}, LCW{"25-9000"}}, - valids); + valids.data()); auto prog = cudf::strings::regex_program::create(pattern); auto results = cudf::strings::findall(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 5fa02d9978a..1785848ec77 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ 
b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -30,7 +30,7 @@ namespace transformation { struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {}; template -void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, bool is_ptx) +void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool is_ptx) { auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); From 4018d3116b2bfd876253b187894df10cb325db2f Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 27 Sep 2024 13:17:03 -0400 Subject: [PATCH 10/14] Remove superfluous use of std::vector for std::future (#16829) This PR addresses #16888 , where a superfluous use of `std::vector` should be removed. closes #16888 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16829 --- cpp/src/io/parquet/reader_impl.hpp | 4 +-- cpp/src/io/parquet/reader_impl_preprocess.cu | 26 +++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 2d46da14bec..62ffc4d3077 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -188,10 +188,10 @@ class reader::impl { * * Does not decompress the chunk data. * - * @return pair of boolean indicating if compressed chunks were found and a vector of futures for + * @return pair of boolean indicating if compressed chunks were found and a future for * read completion */ - std::pair>> read_column_chunks(); + std::pair> read_column_chunks(); /** * @brief Read compressed data and page information for the current pass. 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8e67f233213..3763c2e8e6d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -964,7 +964,7 @@ void reader::impl::allocate_level_decode_space() } } -std::pair>> reader::impl::read_column_chunks() +std::pair> reader::impl::read_column_chunks() { auto const& row_groups_info = _pass_itm_data->row_groups; @@ -989,7 +989,6 @@ std::pair>> reader::impl::read_column_chunks // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide // skip_rows/num_rows // auto remaining_rows = num_rows; - std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); @@ -1018,16 +1017,15 @@ std::pair>> reader::impl::read_column_chunks } // Read compressed chunk data to device memory - read_chunk_tasks.push_back(read_column_chunks_async(_sources, - raw_page_data, - chunks, - 0, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - _stream)); - - return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; + return {total_decompressed_size > 0, + read_column_chunks_async(_sources, + raw_page_data, + chunks, + 0, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)}; } void reader::impl::read_compressed_data() @@ -1042,9 +1040,7 @@ void reader::impl::read_compressed_data() auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); pass.has_compressed_data = has_compressed_data; - for (auto& task : read_chunks_tasks) { - task.wait(); - } + read_chunks_tasks.wait(); // Process dataset chunk pages into output columns auto const total_pages = _has_page_index ? 
count_page_headers_with_pgidx(chunks, _stream) From afe9f929abf565c235d5a4e375ef33f2cf032487 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 27 Sep 2024 10:55:48 -0700 Subject: [PATCH 11/14] clang-tidy fixes part 2 (#16938) Subset of improvements to the code base proposed by the latest version of clang-tidy. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16938 --- cpp/src/datetime/timezone.cpp | 2 +- cpp/src/dictionary/dictionary_column_view.cpp | 5 ++--- cpp/src/interop/dlpack.cpp | 4 ++-- cpp/src/io/avro/avro.cpp | 2 +- cpp/src/io/avro/avro.hpp | 21 ++++++++++--------- cpp/src/io/comp/uncomp.cpp | 4 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 21 ++++++++++--------- cpp/src/jit/parser.cpp | 4 +--- cpp/src/strings/regex/regcomp.cpp | 4 ++-- cpp/src/utilities/stream_pool.cpp | 2 +- 10 files changed, 34 insertions(+), 35 deletions(-) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index cf239297255..a6b6cbbf0b5 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -38,7 +38,7 @@ std::string const tzif_system_directory = "/usr/share/zoneinfo/"; struct timezone_file_header { uint32_t magic; ///< "TZif" uint8_t version; ///< 0:version1, '2':version2, '3':version3 - uint8_t reserved15[15]; ///< unused, reserved for future use + uint8_t reserved15[15]; ///< unused, reserved for future use // NOLINT uint32_t isutccnt; ///< number of UTC/local indicators contained in the body uint32_t isstdcnt; ///< number of standard/wall indicators contained in the body uint32_t leapcnt; ///< number of leap second records contained in the body diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index 4906e5b4f9c..3e4a201bba4 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,7 @@ column_view dictionary_column_view::indices() const noexcept { return child(0); column_view dictionary_column_view::get_indices_annotated() const noexcept { - return column_view( - indices().type(), size(), indices().head(), null_mask(), null_count(), offset()); + return {indices().type(), size(), indices().head(), null_mask(), null_count(), offset()}; } column_view dictionary_column_view::keys() const noexcept { return child(1); } diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index ba5b11b90d8..a1be6aade4e 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -118,8 +118,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; - int64_t strides[2]; + int64_t shape[2]; // NOLINT + int64_t strides[2]; // NOLINT rmm::device_buffer buffer; static void deleter(DLManagedTensor* arg) diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 2041f03cd81..03cf6d4a0e0 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -199,7 +199,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // Read the next sync markers and ensure they match the first ones we // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. 
- uint64_t const sync_marker[] = {get_raw(), get_raw()}; + std::array const sync_marker = {get_raw(), get_raw()}; bool valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index f2813a1ba51..2e992546ccc 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "avro_common.hpp" #include +#include #include #include #include @@ -100,15 +101,15 @@ struct column_desc { */ struct file_metadata { std::map user_data; - std::string codec = ""; - uint64_t sync_marker[2] = {0, 0}; - size_t metadata_size = 0; - size_t total_data_size = 0; - size_t selected_data_size = 0; - size_type num_rows = 0; - size_type skip_rows = 0; - size_type total_num_rows = 0; - uint32_t max_block_size = 0; + std::string codec = ""; + std::array sync_marker = {0, 0}; + size_t metadata_size = 0; + size_t total_data_size = 0; + size_t selected_data_size = 0; + size_type num_rows = 0; + size_type skip_rows = 0; + size_type total_num_rows = 0; + uint32_t max_block_size = 0; std::vector schema; std::vector block_list; std::vector columns; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 602ff1734b6..1af45b41d8e 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -42,7 +42,7 @@ struct gz_file_header_s { uint8_t id2; // 0x8b uint8_t comp_mthd; // compression method (0-7=reserved, 8=deflate) uint8_t flags; // flags (GZIPHeaderFlag) - uint8_t mtime[4]; // If non-zero: modification time (Unix format) + uint8_t mtime[4]; // If non-zero: modification time (Unix format) // NOLINT uint8_t xflags; // Extra 
compressor-specific flags uint8_t os; // OS id }; @@ -103,7 +103,7 @@ struct zip_lfh_s { }; struct bz2_file_header_s { - uint8_t sig[3]; // "BZh" + uint8_t sig[3]; // "BZh" // NOLINT uint8_t blksz; // block size 1..9 in 100kB units (post-RLE) }; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1390339c1ae..e631e12119d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -294,7 +294,8 @@ struct PageInfo { int32_t uncompressed_page_size; // uncompressed data size in bytes // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length // indicator. instead the lengths for these are stored in the header. - int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) + int32_t // NOLINT + lvl_bytes[level_type::NUM_LEVEL_TYPES]; // length of the rep/def levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -345,7 +346,7 @@ struct PageInfo { PageNestingDecodeInfo* nesting_decode; // level decode buffers - uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; + uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; // NOLINT // temporary space for decoding DELTA_BYTE_ARRAY encoded strings int64_t temp_string_size; @@ -431,14 +432,14 @@ struct ColumnChunkDesc { size_t num_values{}; // total number of values in this column size_t start_row{}; // file-wide, absolute starting row of this chunk uint32_t num_rows{}; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level - int16_t max_nesting_depth{}; // max nesting depth of the output - int32_t type_length{}; // type length from schema (for FLBA only) - Type physical_type{}; // parquet physical data type - uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level // NOLINT + int16_t max_nesting_depth{}; // max nesting depth of the output + int32_t type_length{}; // type length from schema (for FLBA only) + Type physical_type{}; // parquet physical data type + uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max // NOLINT + // definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; string_index_pair* str_dict_index{}; // index for string dictionary bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 398c36821cc..519ac2d1a2e 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -19,8 +19,6 @@ #include #include -#include -#include #include #include #include 
@@ -28,7 +26,7 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; +constexpr char percent_escape[] = "_"; // NOLINT inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7c4c89bd3fb..51c6e765edd 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -35,7 +35,7 @@ namespace strings { namespace detail { namespace { // Bitmask of all operators -#define OPERATOR_MASK 0200 +enum { OPERATOR_MASK = 0200 }; enum OperatorType : int32_t { START = 0200, // Start, used for marker on stack LBRA_NC = 0203, // non-capturing group @@ -50,7 +50,7 @@ enum OperatorType : int32_t { COUNTED_LAZY = 0215, NOP = 0302, // No operation, internal use only }; -#define ITEM_MASK 0300 +enum { ITEM_MASK = 0300 }; static reclass cclass_w(CCLASS_W); // \w static reclass cclass_s(CCLASS_S); // \s diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9824c472b20..8c29182bfb5 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -82,7 +82,7 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { return streams; } - std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } + [[nodiscard]] std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; } }; /** From 670cc3f9c6add1fddde142ec3dece65643d3f022 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 27 Sep 2024 08:43:53 -1000 Subject: [PATCH 12/14] Avoid public constructors when called with columns to avoid unnecessary validation (#16747) This PR continues an effort to avoid some public constructors when passing a column(s) to avoid unnecessary validation Maybe we should consider disallowing public constructors to accept columns all-together, but I suspect some RAPIDS libraries are passing columns to 
public constructors Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16747 --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/dataframe.py | 44 +++++++++------------ python/cudf/cudf/core/multiindex.py | 26 +++++------- python/cudf/cudf/core/reshape.py | 29 +++++++------- python/cudf/cudf/core/window/ewm.py | 33 ++++++++-------- python/cudf/cudf/core/window/rolling.py | 39 +++++++++--------- 6 files changed, 77 insertions(+), 96 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de5ed15771d..864e87b5377 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1337,7 +1337,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): - new_cats = cudf.Series(new_cats)._column.unique() + new_cats = new_cats.unique() if cur_cats.equals(new_cats, check_dtypes=True): # TODO: Internal usages don't always need a copy; add a copy keyword diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 16b0aa95c35..79ed5a0e187 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6287,14 +6287,17 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): ) if not skipna and any(col.nullable for col in filtered._columns): - mask = DataFrame( + length = filtered._data.nrows + ca = ColumnAccessor( { - name: filtered._data[name]._get_mask_as_column() - if filtered._data[name].nullable - else as_column(True, length=len(filtered._data[name])) - for name in filtered._column_names - } + name: col._get_mask_as_column() + if col.nullable + else as_column(True, length=length) + for name, col in filtered._data.items() + }, + verify=False, ) + mask = 
DataFrame._from_data(ca) mask = mask.all(axis=1) else: mask = None @@ -6679,19 +6682,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) return Series._from_column(result, index=self.index) else: - result_df = DataFrame(result).set_index(self.index) + result_df = DataFrame(result, index=self.index) result_df._set_columns_like(prepared._data) return result_df - @_performance_tracking - def _columns_view(self, columns): - """ - Return a subset of the DataFrame's columns as a view. - """ - return DataFrame( - {col: self._data[col] for col in columns}, index=self.index - ) - @_performance_tracking def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame's columns based on the column dtypes. @@ -6763,8 +6757,6 @@ def select_dtypes(self, include=None, exclude=None): if not isinstance(exclude, (list, tuple)): exclude = (exclude,) if exclude is not None else () - df = DataFrame(index=self.index) - # cudf_dtype_from_pydata_dtype can distinguish between # np.float and np.number selection = tuple(map(frozenset, (include, exclude))) @@ -6820,12 +6812,12 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._column_labels_and_values: - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) - if infered_type in inclusion: - df._insert(len(df._data), k, col) - - return df + to_select = [ + label + for label, dtype in self._dtypes + if cudf_dtype_from_pydata_dtype(dtype) in inclusion + ] + return self.loc[:, to_select] @ioutils.doc_to_parquet() def to_parquet( @@ -7331,7 +7323,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7374,7 +7366,7 @@ def corr( corr = 
cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6de3981ba66..92d094d9de5 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -700,7 +700,10 @@ def _compute_validity_mask(self, index, row_tuple, max_length): lookup_dict[i] = row lookup = cudf.DataFrame(lookup_dict) frame = cudf.DataFrame._from_data( - ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ColumnAccessor( + dict(enumerate(index._columns)), + verify=False, + ) ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) @@ -780,18 +783,12 @@ def _index_and_downcast(self, result, index, index_key): index_key = index_key[0] slice_access = isinstance(index_key, slice) - out_index = cudf.DataFrame() - # Select the last n-k columns where n is the number of columns and k is + # Count the last n-k columns where n is the number of columns and k is # the length of the indexing tuple size = 0 if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) - for k in range(size, len(index._data)): - out_index.insert( - out_index._num_columns, - k, - cudf.Series._from_column(index._columns[k]), - ) + num_selected = max(0, index.nlevels - size) # determine if we should downcast from a DataFrame to a Series need_downcast = ( @@ -814,16 +811,13 @@ def _index_and_downcast(self, result, index, index_key): result = cudf.Series._from_data( {}, name=tuple(col[0] for col in index._columns) ) - elif out_index._num_columns == 1: + elif num_selected == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. 
- last_column = index._columns[-1] - out_index = cudf.Index._from_column( - last_column, name=index.names[-1] - ) - index = out_index - elif out_index._num_columns > 1: + *_, last_column = index._data.columns + index = cudf.Index._from_column(last_column, name=index.names[-1]) + elif num_selected > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 401fef67ee6..6e5abb2b82b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -961,14 +961,14 @@ def _merge_sorted( ) -def _pivot(df, index, columns): +def _pivot(col_accessor: ColumnAccessor, index, columns) -> cudf.DataFrame: """ Reorganize the values of the DataFrame according to the given index and columns. Parameters ---------- - df : DataFrame + col_accessor : DataFrame index : cudf.Index Index labels of the result columns : cudf.Index @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._column_labels_and_values: + for col_label, col in col_accessor.items(): names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1067,22 +1067,21 @@ def pivot(data, columns=None, index=no_default, values=no_default): 2 three """ - df = data values_is_list = True if values is no_default: - values = df._columns_view( - col for col in df._column_names if col not in (index, columns) - ) + cols_to_select = [ + col for col in data._column_names if col not in (index, columns) + ] + elif not isinstance(values, (list, tuple)): + cols_to_select = [values] + values_is_list = False else: - if not isinstance(values, (list, tuple)): - values = [values] - values_is_list = False - values = df._columns_view(values) + cols_to_select = values if index is no_default: - index = df.index + index = data.index else: - index = 
cudf.Index(df.loc[:, index]) - columns = cudf.Index(df.loc[:, columns]) + index = cudf.Index(data.loc[:, index]) + columns = cudf.Index(data.loc[:, columns]) # Create a DataFrame composed of columns from both # columns and index @@ -1096,7 +1095,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): if len(columns_index) != len(columns_index.drop_duplicates()): raise ValueError("Duplicate index-column pairs found. Cannot reshape.") - result = _pivot(values, index, columns) + result = _pivot(data._data.select_by_label(cols_to_select), index, columns) # MultiIndex to Index if not values_is_list: diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index ef0f6958aeb..094df955273 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import Literal +from typing import TYPE_CHECKING, Literal import numpy as np @@ -10,6 +10,9 @@ from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class ExponentialMovingWindow(_RollingBase): r""" @@ -179,8 +182,10 @@ def cov( ): raise NotImplementedError("cov not yet supported.") - def _apply_agg_series(self, sr, agg_name): - if not is_numeric_dtype(sr.dtype): + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + if not is_numeric_dtype(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only @@ -188,20 +193,14 @@ def _apply_agg_series(self, sr, agg_name): # pandas does nans in the same positions mathematically. # as such we need to convert the nans to nulls before # passing them in. 
- to_libcudf_column = sr._column.astype("float64").nans_to_nulls() - - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self( - [ - scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, - ) - ] - ) + to_libcudf_column = source_column.astype("float64").nans_to_nulls() + + return scan( + agg_name, + to_libcudf_column, + True, + com=self.com, + adjust=self.adjust, ) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 043a41145e5..967edc2ab15 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import TYPE_CHECKING import numba import pandas as pd @@ -16,25 +17,29 @@ from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + class _RollingBase: """ - Contains methods common to all kinds of rolling + Contains routines to apply a window aggregation to a column. 
""" - def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + obj: cudf.DataFrame | cudf.Series - def _apply_agg(self, agg_name): - if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) - else: - return self._apply_agg_dataframe(self.obj, agg_name) + def _apply_agg_column( + self, source_column: ColumnBase, agg_name: str + ) -> ColumnBase: + raise NotImplementedError + + def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series: + applied = ( + self._apply_agg_column(col, agg_name) for col in self.obj._columns + ) + return self.obj._from_data_like_self( + self.obj._data._from_columns_like_self(applied) + ) class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): @@ -290,14 +295,6 @@ def _apply_agg_column(self, source_column, agg_name): agg_params=self.agg_params, ) - def _apply_agg(self, agg_name): - applied = ( - self._apply_agg_column(col, agg_name) for col in self.obj._columns - ) - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self(applied) - ) - def _reduce( self, op: str, From 22d481a4e3a34d517ad9a9ac46b8b1b456d365c6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Sep 2024 14:45:47 -0400 Subject: [PATCH 13/14] Fix JsonLargeReaderTest.MultiBatch use of LIBCUDF_JSON_BATCH_SIZE env var (#16927) Fixes the `unsetenv` to use `LIBCUDF_JSON_BATCH_SIZE` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16927 --- cpp/tests/large_strings/json_tests.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/large_strings/json_tests.cu 
b/cpp/tests/large_strings/json_tests.cu index 80bde168b75..a212d7d654a 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -96,5 +96,5 @@ TEST_F(JsonLargeReaderTest, MultiBatch) } // go back to normal batch_size - unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); + unsetenv("LIBCUDF_JSON_BATCH_SIZE"); } From 6973ef806bc9d3cbda37a4c7caa763da12b84b7f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 27 Sep 2024 16:50:15 -0400 Subject: [PATCH 14/14] Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter (#16923) Addresses #16915 Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16923 --- cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/tests/io/json/json_test.cpp | 24 ++++ cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 1c15e147b13..bf81162a0ac 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId { constexpr auto pda_sgid_lookup_size = static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); // We map the delimiter character to LINE_BREAK symbol group id, and the newline character - // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, + // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. auto const symbol_position = symbol == delimiter ? static_cast('\n') - : (symbol == '\n' ? static_cast(delimiter) : static_cast(symbol)); + : (symbol == '\n' ? 
static_cast(' ') : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 68ec255b39d..a094ac7d772 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter) EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument); } +TEST_F(JsonReaderTest, ViableDelimiterNewlineWS) +{ + // Test input + std::string input = R"({"a": + 100})"; + + cudf::io::json_reader_options json_parser_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()}) + .lines(true) + .delimiter('\0'); + + auto result = cudf::io::read_json(json_parser_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 1); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + + auto col1_iterator = thrust::constant_iterator(100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper(col1_iterator, col1_iterator + 1)); +} + // Test case for dtype prune: // all paths, only one. // one present, another not present, nothing present diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index 327169ae563..f32aba0e632 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter) } } +TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter) +{ + // Test input. 
Inline comments used to indicate character indexes + // 012345678 <= line 0 + char const delimiter = GetParam(); + + /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as + * a delimiter for JSONL) + * {"a":2} + * {"a":{"a":{"a":[321{"a":[1]} + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + std::string input = R"({"a":2})" + "\n"; + // starting position 8 (zero indexed) + input += R"({"a":)" + std::string(1, delimiter); + // starting position 14 (zero indexed) + input += R"({"a":{"a":[321)" + std::string(1, delimiter); + // starting position 29 (zero indexed) + input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter); + // starting position 41 (zero indexed) + input += R"({"b":123})" + "\n"; + // starting position 51 (zero indexed) + input += R"({"b":123})" + std::string(1, delimiter); + // starting position 61 (zero indexed) + input += R"({"b")" + std::string("\n:\n\n\n123\n}"); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream; + if (delimiter != '\n') { + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 3 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, 
token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 4 (valid) + {61, token_t::StructBegin}, + {62, token_t::StructMemberBegin}, + {62, token_t::FieldNameBegin}, + {64, token_t::FieldNameEnd}, + {70, token_t::ValueBegin}, + {73, token_t::ValueEnd}, + {74, token_t::StructMemberEnd}, + {74, token_t::StructEnd}}; + } else { + /* Input: + * {"a":2} + * {"a": + * {"a":{"a":[321 + * {"a":[1]} + * + * + * {"b":123} + * {"b":123} + * {"b"\n:\n\n\n123\n} + */ + golden_token_stream = {// Line 0 (valid) + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {6, token_t::ValueEnd}, + {6, token_t::StructMemberEnd}, + {6, token_t::StructEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (valid) + {29, token_t::StructBegin}, + {30, token_t::StructMemberBegin}, + {30, token_t::FieldNameBegin}, + {32, token_t::FieldNameEnd}, + {34, token_t::ListBegin}, + {35, token_t::ValueBegin}, + {36, token_t::ValueEnd}, + {36, token_t::ListEnd}, + {37, token_t::StructMemberEnd}, + {37, token_t::StructEnd}, + // Line 4 (valid) + {41, token_t::StructBegin}, + {42, token_t::StructMemberBegin}, + {42, token_t::FieldNameBegin}, + {44, token_t::FieldNameEnd}, + {46, token_t::ValueBegin}, + {49, token_t::ValueEnd}, + {49, token_t::StructMemberEnd}, + {49, token_t::StructEnd}, + // Line 5 (valid) + {51, token_t::StructBegin}, + {52, token_t::StructMemberBegin}, + {52, token_t::FieldNameBegin}, + {54, token_t::FieldNameEnd}, + {56, token_t::ValueBegin}, + {59, token_t::ValueEnd}, + {59, token_t::StructMemberEnd}, + {59, token_t::StructEnd}, + // Line 6 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + {0, token_t::StructBegin}, + {0, 
token_t::StructEnd}}; + } + + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Default parsing options + cudf::io::json_reader_options const in_opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .delimiter(delimiter) + .lines(true); + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, in_opts, stream, cudf::get_current_device_resource_ref()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + stream.synchronize(); + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + CUDF_TEST_PROGRAM_MAIN()