From b1e1c9c060cc6b4b35b8590209177584336444bc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 26 Sep 2024 11:00:00 -0700
Subject: [PATCH 01/14] Reapply `mixed_semi_join` refactoring and bug fixes
 (#16859)

This PR reapplies changes from #16230 and adds bug fixes and performance improvements for mixed_semi_join.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16859
---
 cpp/src/join/join_common_utils.hpp       |   6 -
 cpp/src/join/mixed_join_common_utils.cuh |  34 ++++++
 cpp/src/join/mixed_join_kernels_semi.cu  |  51 ++++----
 cpp/src/join/mixed_join_kernels_semi.cuh |   6 +-
 cpp/src/join/mixed_join_semi.cu          |  92 +++++---------
 cpp/tests/join/mixed_join_tests.cu       | 147 +++++++++++++++++++++++
 6 files changed, 239 insertions(+), 97 deletions(-)
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 86402a0e7de..573101cefd9 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,7 +22,6 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
-#include <cuco/static_map.cuh>
 #include <cuco/static_multimap.cuh>
 #include <cuda/atomic>
 
@@ -51,11 +50,6 @@ using mixed_multimap_type =
                         cudf::detail::cuco_allocator<char>,
                         cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::legacy::static_map<hash_value_type,
-                                               size_type,
-                                               cuda::thread_scope_device,
-                                               cudf::detail::cuco_allocator<char>>;
-
 using row_hash_legacy =
   cudf::row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
 
diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 19701816867..4a52cfe098a 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,6 +25,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <cub/cub.cuh>
+#include <cuco/static_set.cuh>
 
 namespace cudf {
 namespace detail {
@@ -160,6 +161,39 @@ struct pair_expression_equality : public expression_equality<has_nulls> {
   }
 };
 
+/**
+ * @brief Equality comparator that composes two row_equality comparators.
+ */
+struct double_row_equality_comparator {
+  row_equality const equality_comparator;
+  row_equality const conditional_comparator;
+
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
+  {
+    using experimental::row::lhs_index_type;
+    using experimental::row::rhs_index_type;
+
+    return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
+           conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
+  }
+};
+
+// A CUDA Cooperative Group of 1 thread for the hash set used by mixed semi joins.
+auto constexpr DEFAULT_MIXED_SEMI_JOIN_CG_SIZE = 1;
+
+// The hash set type used by mixed_semi_join with the build_table.
+using hash_set_type =
+  cuco::static_set<size_type,
+                   cuco::extent<size_t>,
+                   cuda::thread_scope_device,
+                   double_row_equality_comparator,
+                   cuco::linear_probing<DEFAULT_MIXED_SEMI_JOIN_CG_SIZE, row_hash>,
+                   cudf::detail::cuco_allocator<char>,
+                   cuco::storage<1>>;
+
+// The hash_set_ref_type used by mixed_semi_join kernels for probing.
+using hash_set_ref_type = hash_set_type::ref_type<cuco::contains_tag>;
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 7459ac3e99c..bd8c80652a0 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -38,38 +38,48 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   table_device_view right_table,
                   table_device_view probe,
                   table_device_view build,
-                  row_hash const hash_probe,
                   row_equality const equality_probe,
-                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  hash_set_ref_type set_ref,
                   cudf::device_span<bool> left_table_keep_mask,
                   cudf::ast::detail::expression_device_view device_expression_data)
 {
+  auto constexpr cg_size = hash_set_ref_type::cg_size;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
   // used to circumvent conflicts between arrays of different types between
   // different template instantiations due to the extern specifier.
   extern __shared__ char raw_intermediate_storage[];
-  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
+  auto intermediate_storage =
     reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
   auto thread_intermediate_storage =
-    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
+    intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates);
 
-  cudf::size_type const left_num_rows  = left_table.num_rows();
-  cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = left_num_rows;
+  // Equality evaluator to use
+  auto const evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
+    left_table, right_table, device_expression_data);
 
-  cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size;
+  // Make sure to swap_tables here as hash_set will use probe table as the left one
+  auto constexpr swap_tables = true;
+  auto const equality        = single_expression_equality<has_nulls>{
+    evaluator, thread_intermediate_storage, swap_tables, equality_probe};
 
-  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data);
+  // Create set ref with the new equality comparator
+  auto const set_ref_equality = set_ref.with_key_eq(equality);
 
-  if (outer_row_index < outer_num_rows) {
-    // Figure out the number of elements for this key.
-    auto equality = single_expression_equality<has_nulls>{
-      evaluator, thread_intermediate_storage, false, equality_probe};
+  // Total number of rows to query the set
+  auto const outer_num_rows = left_table.num_rows();
+  // Grid stride for the tile
+  auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride<block_size>() / cg_size;
 
-    left_table_keep_mask[outer_row_index] =
-      hash_table_view.contains(outer_row_index, hash_probe, equality);
+  // Find all the rows in the left table that are in the hash table
+  for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>() / cg_size;
+       outer_row_index < outer_num_rows;
+       outer_row_index += cg_grid_stride) {
+    auto const result = set_ref_equality.contains(tile, outer_row_index);
+    if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; }
   }
 }
 
@@ -78,9 +88,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
@@ -94,9 +103,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   } else {
@@ -106,9 +114,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   }
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 43714ffb36a..b08298e64e4 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -45,9 +45,8 @@ namespace detail {
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
  * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
  * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] set_ref The hash set reference built from `build`, used to probe for matches.
  * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating
  * the corresponding index from left table is present in output
  * @param[in] device_expression_data Container of device data required to evaluate the desired
@@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index aa4fa281159..83a55eca50f 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -45,45 +45,6 @@
 namespace cudf {
 namespace detail {
 
-namespace {
-/**
- * @brief Device functor to create a pair of hash value and index for a given row.
- */
-struct make_pair_function_semi {
-  __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept
-  {
-    // The value is irrelevant since we only ever use the hash map to check for
-    // membership of a particular row index.
-    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
-  }
-};
-
-/**
- * @brief Equality comparator that composes two row_equality comparators.
- */
-class double_row_equality {
- public:
-  double_row_equality(row_equality equality_comparator, row_equality conditional_comparator)
-    : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator}
-  {
-  }
-
-  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
-  {
-    using experimental::row::lhs_index_type;
-    using experimental::row::rhs_index_type;
-
-    return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
-           _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
-  }
-
- private:
-  row_equality _equality_comparator;
-  row_equality _conditional_comparator;
-};
-
-}  // namespace
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -95,7 +56,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
+  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and
                  (join_type != join_kind::FULL_JOIN),
                "Inner, left, and full joins should use mixed_join.");
 
@@ -136,7 +97,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // output column and follow the null-supporting expression evaluation code
   // path.
   auto const has_nulls = cudf::nullate::DYNAMIC{
-    cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) ||
+    cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or
     binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)};
 
   auto const parser = ast::detail::expression_parser{
@@ -155,27 +116,20 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto right_conditional_view = table_device_view::create(right_conditional, stream);
 
   auto const preprocessed_build =
-    experimental::row::equality::preprocessed_table::create(build, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(build, stream);
   auto const preprocessed_probe =
-    experimental::row::equality::preprocessed_table::create(probe, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
   auto const row_comparator =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
+    cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe};
   auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
 
-  semi_map_type hash_table{
-    compute_hash_table_size(build.num_rows()),
-    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-    cuco::empty_value{cudf::detail::JoinNoneValue},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
   // Create hash table containing all keys found in right table
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const build_nulls    = cudf::nullate::DYNAMIC{cudf::has_nulls(build)};
   auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build     = row_hash_build.device_hasher(build_nulls);
+
   // Since we may see multiple rows that are identical in the equality tables
   // but differ in the conditional tables, the equality comparator used for
   // insertion must account for both sets of tables. An alternative solution
@@ -190,20 +144,28 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto const equality_build_equality =
     row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
   auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(right_conditional, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream);
   auto const row_comparator_conditional_build =
     cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
                                                             preprocessed_build_condtional};
   auto const equality_build_conditional =
     row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
-  double_row_equality equality_build{equality_build_equality, equality_build_conditional};
-  make_pair_function_semi pair_func_build{};
 
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build);
+  hash_set_type row_set{
+    {compute_hash_table_size(build.num_rows())},
+    cuco::empty_key{JoinNoneValue},
+    {equality_build_equality, equality_build_conditional},
+    {row_hash_build.device_hasher(build_nulls)},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    {stream.value()}};
+
+  auto iter = thrust::make_counting_iterator(0);
 
   // skip rows that are null here.
   if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
-    hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value());
+    row_set.insert_async(iter, iter + right_num_rows, stream.value());
   } else {
     thrust::counting_iterator<cudf::size_type> stencil(0);
     auto const [row_bitmask, _] =
@@ -211,18 +173,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
     row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
 
     // insert valid rows
-    hash_table.insert_if(
-      iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value());
+    row_set.insert_if_async(iter, iter + right_num_rows, stencil, pred, stream.value());
   }
 
-  auto hash_table_view = hash_table.get_device_view();
-
-  detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+  detail::grid_1d const config(outer_num_rows * hash_set_type::cg_size, DEFAULT_JOIN_BLOCK_SIZE);
+  auto const shmem_size_per_block =
+    parser.shmem_per_thread *
+    cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size);
 
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
   auto const hash_probe = row_hash.device_hasher(has_nulls);
 
+  hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe);
+
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
@@ -231,9 +194,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
                          *right_conditional_view,
                          *probe_view,
                          *build_view,
-                         hash_probe,
                          equality_probe,
-                         hash_table_view,
+                         row_set_ref,
                          cudf::device_span<bool>(left_table_keep_mask),
                          parser.device_expression_data,
                          config,
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index 6c147c8a128..9041969bec7 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -778,6 +778,138 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
              {1});
 }
 
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {2, 7, 8});
+}
+
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMapLarge)
+{
+  using T1 = double;
+
+  // Number of rows in each column
+  auto constexpr N = 10000;
+
+  // Generate column data for left and right tables
+  auto const [left_col0, right_col0] = gen_random_nullable_repeated_columns<T1>(N, 200);
+  auto const [left_col1, right_col1] = gen_random_nullable_repeated_columns<T1>(N, 100);
+
+  // Setup data and nulls for the left table
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> lefts = {
+    {left_col0.first, left_col0.second}, {left_col1.first, left_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> left_wrappers;
+  std::vector<cudf::column_view> left_columns;
+  for (auto [data, valids] : lefts) {
+    left_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    left_columns.emplace_back(left_wrappers.back());
+  };
+
+  // Setup data and nulls for the right table
+  std::vector<std::pair<std::vector<T1>, std::vector<bool>>> rights = {
+    {right_col0.first, right_col0.second}, {right_col1.first, right_col1.second}};
+  std::vector<cudf::test::fixed_width_column_wrapper<T1>> right_wrappers;
+  std::vector<cudf::column_view> right_columns;
+  for (auto [data, valids] : rights) {
+    right_wrappers.emplace_back(
+      cudf::test::fixed_width_column_wrapper<T1>(data.begin(), data.end(), valids.begin()));
+    right_columns.emplace_back(right_wrappers.back());
+  };
+
+  // Left and right table views.
+  auto const left_table  = cudf::table_view{left_columns};
+  auto const right_table = cudf::table_view{right_columns};
+
+  // Using the zeroth column for equality.
+  auto const left_equality  = left_table.select({0});
+  auto const right_equality = right_table.select({0});
+
+  // Column references for equality column.
+  auto const col_ref_left_0  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_zero_eq_right_zero =
+    cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+
+  // Mixed semi join with zeroth column equality
+  {
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join =
+      cudf::conditional_left_semi_join(left_table, right_table, left_zero_eq_right_zero);
+
+    // Actual mixed_left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_zero_eq_right_zero,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+
+  // Mixed semi join with zeroth column equality and first column GREATER conditional
+  {
+    // Column references for conditional column.
+    auto const col_ref_left_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
+    auto const col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
+    auto left_one_gt_right_one =
+      cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+    // Expected left_semi_join result
+    auto const expected_mixed_semi_join = cudf::conditional_left_semi_join(
+      left_table,
+      right_table,
+      cudf::ast::operation(
+        cudf::ast::ast_operator::LOGICAL_AND, left_zero_eq_right_zero, left_one_gt_right_one));
+
+    // Actual left_semi_join result
+    auto const mixed_semi_join = cudf::mixed_left_semi_join(left_equality,
+                                                            right_equality,
+                                                            left_table,
+                                                            right_table,
+                                                            left_one_gt_right_one,
+                                                            cudf::null_equality::UNEQUAL);
+
+    // Copy data back to host for comparisons
+    auto expected_indices = cudf::detail::make_std_vector_async<int32_t>(
+      cudf::device_span<int32_t>(*expected_mixed_semi_join), cudf::get_default_stream());
+    auto result_indices = cudf::detail::make_std_vector_sync<int32_t>(
+      cudf::device_span<int32_t>(*mixed_semi_join), cudf::get_default_stream());
+
+    // Sort the indices for 1-1 comparison
+    std::sort(expected_indices.begin(), expected_indices.end());
+    std::sort(result_indices.begin(), result_indices.end());
+
+    // Expected and actual vectors must match.
+    EXPECT_EQ(expected_mixed_semi_join->size(), mixed_semi_join->size());
+    EXPECT_TRUE(
+      std::equal(expected_indices.begin(), expected_indices.end(), result_indices.begin()));
+  }
+}
+
 TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
 {
   this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
@@ -900,3 +1032,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality)
              left_zero_eq_right_zero,
              {0, 1, 3});
 }
+
+TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {0, 1, 3, 4, 5, 6, 9});
+}

From d69e4b6fbdff9ad402a37de7940d64ed16b7d329 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 26 Sep 2024 08:07:48 -1000
Subject: [PATCH 02/14] Respect groupby.nunique(dropna=False) (#16921)

closes https://github.com/rapidsai/cudf/issues/16861

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16921
---
 python/cudf/cudf/_lib/aggregation.pyx    |  7 +++++--
 python/cudf/cudf/core/groupby/groupby.py | 16 ++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 17 +++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 7c91533cf93..3c96b90f0a1 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -78,8 +78,11 @@ class Aggregation:
         )
 
     @classmethod
-    def nunique(cls):
-        return cls(pylibcudf.aggregation.nunique(pylibcudf.types.NullPolicy.EXCLUDE))
+    def nunique(cls, dropna=True):
+        return cls(pylibcudf.aggregation.nunique(
+            pylibcudf.types.NullPolicy.EXCLUDE
+            if dropna else pylibcudf.types.NullPolicy.INCLUDE
+        ))
 
     @classmethod
     def nth(cls, size):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index cb8cd0cd28b..be05075a2cd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2232,6 +2232,22 @@ def func(x):
 
         return self.agg(func)
 
+    @_performance_tracking
+    def nunique(self, dropna: bool = True):
+        """
+        Return number of unique elements in the group.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include NaN in the counts.
+        """
+
+        def func(x):
+            return getattr(x, "nunique")(dropna=dropna)
+
+        return self.agg(func)
+
     @_performance_tracking
     def std(
         self,
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 848bc259e7b..14ba9894fd3 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1940,6 +1940,23 @@ def test_groupby_nunique(agg, by):
     assert_groupby_results_equal(expect, got, check_dtype=False)
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+def test_nunique_dropna(dropna):
+    gdf = cudf.DataFrame(
+        {
+            "a": [1, 1, 2],
+            "b": [4, None, 5],
+            "c": [None, None, 7],
+            "d": [1, 1, 3],
+        }
+    )
+    pdf = gdf.to_pandas()
+
+    result = gdf.groupby("a")["b"].nunique(dropna=dropna)
+    expected = pdf.groupby("a")["b"].nunique(dropna=dropna)
+    assert_groupby_results_equal(result, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "n",
     [0, 1, 2, 10],

From 742eaadb92b0c5159d92be49e647a17e8c1d0b9b Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 26 Sep 2024 14:27:37 -0500
Subject: [PATCH 03/14] Fix links in Dask cuDF documentation (#16929)

More follow-up fixes to the recent Dask-cuDF documentation additions.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16929
---
 docs/dask_cudf/source/best_practices.rst | 15 +++++++++------
 docs/dask_cudf/source/conf.py            |  1 +
 docs/dask_cudf/source/index.rst          | 11 +++++------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst
index 83039f86fed..41263ebf589 100644
--- a/docs/dask_cudf/source/best_practices.rst
+++ b/docs/dask_cudf/source/best_practices.rst
@@ -81,7 +81,7 @@ representations, native cuDF spilling may be insufficient. For these cases,
 `JIT-unspill <https://docs.rapids.ai/api/dask-cuda/nightly/spilling/#jit-unspill>`__
 is likely to produce better protection from out-of-memory (OOM) errors.
 Please see `Dask-CUDA's spilling documentation
-<https://docs.rapids.ai/api/dask-cuda/24.10/spilling/>`__ for further details
+<https://docs.rapids.ai/api/dask-cuda/stable/spilling/>`__ for further details
 and guidance.
 
 Use RMM
@@ -160,7 +160,7 @@ of the underlying task graph to materialize the collection.
 
 :func:`sort_values` / :func:`set_index` : These operations both require Dask to
 eagerly collect quantile information about the column(s) being targeted by the
-global sort operation. See `Avoid Sorting`__ for notes on sorting considerations.
+global sort operation. See the next section for notes on sorting considerations.
 
 .. note::
   When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the
@@ -297,11 +297,14 @@ bottleneck is typically device-to-host memory spilling.
 Although every workflow is different, the following guidelines
 are often recommended:
 
-* `Use a distributed cluster with Dask-CUDA workers <Use Dask-CUDA>`_
-* `Use native cuDF spilling whenever possible <Enable cuDF Spilling>`_
+* Use a distributed cluster with `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__ workers
+
+* Use native cuDF spilling whenever possible (`Dask-CUDA spilling documentation <https://docs.rapids.ai/api/dask-cuda/stable/spilling/>`__)
+
 * Avoid shuffling whenever possible
-  * Use ``split_out=1`` for low-cardinality groupby aggregations
-  * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+    * Use ``split_out=1`` for low-cardinality groupby aggregations
+    * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``)
+
 * `Use UCX <https://docs.rapids.ai/api/dask-cuda/nightly/examples/ucx/>`__ if communication is a bottleneck.
 
 .. note::
diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py
index dc40254312e..5daa8245695 100644
--- a/docs/dask_cudf/source/conf.py
+++ b/docs/dask_cudf/source/conf.py
@@ -78,6 +78,7 @@
     "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None),
     "dask": ("https://docs.dask.org/en/stable/", None),
     "pandas": ("https://pandas.pydata.org/docs/", None),
+    "dask-cuda": ("https://docs.rapids.ai/api/dask-cuda/stable/", None),
 }
 
 numpydoc_show_inherited_class_members = True
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 6eb755d7854..c2891ebc15e 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -16,10 +16,9 @@ as the ``"cudf"`` dataframe backend for
   Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU
   or multi-node execution on their own. You must also deploy a
   `dask.distributed <https://distributed.dask.org/en/stable/>`__ cluster
-  to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
-  <https://docs.rapids.ai/api/dask-cuda/stable/>`__ to simplify the
-  setup of the cluster, taking advantage of all features of the GPU
-  and networking hardware.
+  to leverage multiple GPUs. We strongly recommend using :doc:`dask-cuda:index`
+  to simplify the setup of the cluster, taking advantage of all features
+  of the GPU and networking hardware.
 
 If you are familiar with Dask and `pandas <pandas.pydata.org>`__ or
 `cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask cuDF
@@ -161,7 +160,7 @@ out-of-core computing. This also means that the compute tasks can be
 executed in parallel over a multi-GPU cluster.
 
 In order to execute your Dask workflow on multiple GPUs, you will
-typically need to use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+typically need to use :doc:`dask-cuda:index`
 to deploy distributed Dask cluster, and
 `Distributed <https://distributed.dask.org/en/stable/client.html>`__
 to define a client object. For example::
@@ -192,7 +191,7 @@ to define a client object. For example::
   <https://distributed.dask.org/en/stable/manage-computation.html>`__
   for more details.
 
-Please see the `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+Please see the :doc:`dask-cuda:index`
 documentation for more information about deploying GPU-aware clusters
 (including `best practices
 <https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/>`__).

From 40075f1115ecd82a74b46d98e80e19afbf8a0210 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 26 Sep 2024 16:32:09 -0400
Subject: [PATCH 04/14] Use `changed-files` shared workflow (#16713)

Contributes to https://github.com/rapidsai/build-planning/issues/94

Depends on https://github.com/rapidsai/shared-workflows/pull/239

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16713
---
 .github/workflows/pr.yaml | 140 +++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 84 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a65cae34653..bc237cc73b0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -43,80 +43,52 @@ jobs:
     with:
       needs: ${{ toJSON(needs) }}
   changed-files:
-    runs-on: ubuntu-latest
-    name: "Check changed files"
-    outputs:
-      test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }}
-      test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }}
-      test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }}
-      test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }}
-      test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }}
-    steps:
-      - name: Get PR info
-        id: get-pr-info
-        uses: nv-gha-runners/get-pr-info@main
-      - name: Checkout code repo
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          persist-credentials: false
-      - name: Calculate merge base
-        id: calculate-merge-base
-        env:
-          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
-          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
-        run: |
-          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT"
-      - name: Get changed files
-        id: changed-files
-        uses: tj-actions/changed-files@v45
-        with:
-          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
-          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
-          files_yaml: |
-            cpp:
-              - '**'
-              - '!CONTRIBUTING.md'
-              - '!README.md'
-              - '!docs/**'
-              - '!img/**'
-              - '!java/**'
-              - '!notebooks/**'
-              - '!python/**'
-              - '!ci/cudf_pandas_scripts/**'
-            java:
-              - '**'
-              - '!CONTRIBUTING.md'
-              - '!README.md'
-              - '!docs/**'
-              - '!img/**'
-              - '!notebooks/**'
-              - '!python/**'
-              - '!ci/cudf_pandas_scripts/**'
-            notebooks:
-              - '**'
-              - '!CONTRIBUTING.md'
-              - '!README.md'
-              - '!java/**'
-              - '!ci/cudf_pandas_scripts/**'
-            python:
-              - '**'
-              - '!CONTRIBUTING.md'
-              - '!README.md'
-              - '!docs/**'
-              - '!img/**'
-              - '!java/**'
-              - '!notebooks/**'
-              - '!ci/cudf_pandas_scripts/**'
-            cudf_pandas:
-              - '**'
-              - 'ci/cudf_pandas_scripts/**'
-              - '!CONTRIBUTING.md'
-              - '!README.md'
-              - '!docs/**'
-              - '!img/**'
-              - '!java/**'
-              - '!notebooks/**'
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    with:
+      files_yaml: |
+        test_cpp:
+          - '**'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!ci/cudf_pandas_scripts/**'
+          - '!docs/**'
+          - '!img/**'
+          - '!java/**'
+          - '!notebooks/**'
+          - '!python/**'
+        test_cudf_pandas:
+          - '**'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!docs/**'
+          - '!img/**'
+          - '!java/**'
+          - '!notebooks/**'
+        test_java:
+          - '**'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!ci/cudf_pandas_scripts/**'
+          - '!docs/**'
+          - '!img/**'
+          - '!notebooks/**'
+          - '!python/**'
+        test_notebooks:
+          - '**'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!ci/cudf_pandas_scripts/**'
+          - '!java/**'
+        test_python:
+          - '**'
+          - '!CONTRIBUTING.md'
+          - '!README.md'
+          - '!ci/cudf_pandas_scripts/**'
+          - '!docs/**'
+          - '!img/**'
+          - '!java/**'
+          - '!notebooks/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
@@ -139,7 +111,7 @@ jobs:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_cpp == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-python-build:
@@ -152,7 +124,7 @@ jobs:
     needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
@@ -161,7 +133,7 @@ jobs:
     needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
@@ -169,7 +141,7 @@ jobs:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_java == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -190,7 +162,7 @@ jobs:
     needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_notebooks == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -234,7 +206,7 @@ jobs:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
@@ -251,7 +223,7 @@ jobs:
     needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -283,7 +255,7 @@ jobs:
     needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -303,7 +275,7 @@ jobs:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -314,7 +286,7 @@ jobs:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
-    if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true'
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))

From fa12901024fcc810fcf7f695d2f2e41f472f2306 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 26 Sep 2024 19:07:48 -0400
Subject: [PATCH 05/14] Fix cudf::strings::findall error with empty input
 (#16928)

Fixes `cudf::strings::findall` error when passed an empty input column.
Also adds gtests for empty input and for the case where no rows match.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16928
---
 cpp/src/strings/search/findall.cu   | 10 +++++++---
 cpp/tests/strings/findall_tests.cpp | 28 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index 067a513af96..d8c1b50a94b 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
+#include <cudf/lists/detail/lists_column_factories.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/findall.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -97,8 +98,11 @@ std::unique_ptr<column> findall(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::device_async_resource_ref mr)
 {
-  auto const strings_count = input.size();
-  auto const d_strings     = column_device_view::create(input.parent(), stream);
+  if (input.is_empty()) {
+    return cudf::lists::detail::make_empty_lists_column(input.parent().type(), stream, mr);
+  }
+
+  auto const d_strings = column_device_view::create(input.parent(), stream);
 
   // create device object from regex_program
   auto d_prog = regex_device_builder::create_prog_device(prog, stream);
@@ -113,7 +117,7 @@ std::unique_ptr<column> findall(strings_column_view const& input,
   auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr);
 
   // Build the lists column from the offsets and the strings
-  return make_lists_column(strings_count,
+  return make_lists_column(input.size(),
                            std::move(offsets),
                            std::move(strings_output),
                            input.null_count(),
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 47606b9b3ed..6eea1895fb1 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -148,3 +148,31 @@ TEST_F(StringsFindallTests, LargeRegex)
   LCW expected({LCW{large_regex.c_str()}, LCW{}, LCW{}});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
+
+TEST_F(StringsFindallTests, NoMatches)
+{
+  cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"});
+  auto sv = cudf::strings_column_view(input);
+
+  auto pattern = std::string("(^zzz$)");
+  using LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{}, LCW{}, LCW{}});
+  auto prog    = cudf::strings::regex_program::create(pattern);
+  auto results = cudf::strings::findall(sv, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
+TEST_F(StringsFindallTests, EmptyTest)
+{
+  std::string pattern = R"(\w+)";
+
+  auto prog = cudf::strings::regex_program::create(pattern);
+
+  cudf::test::strings_column_wrapper input;
+  auto sv      = cudf::strings_column_view(input);
+  auto results = cudf::strings::findall(sv, *prog);
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected;
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}

From 9125d2f19ecd6a82f29cdb41928737ec73eb491b Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 26 Sep 2024 20:07:39 -0500
Subject: [PATCH 06/14] reduce wheel build verbosity, narrow deprecation
 warning filter (#16896)

Proposes some small changes I've taken as follow-ups from previous work here.

* #16745 filtered out all linter warnings about uses of `datetime.utcnow()` ... this PR limits that to only the warnings observed from `botocore` (so that the linter will helpfully warn us about such uses directly in `cudf`)
  - ref https://github.com/rapidsai/cudf/pull/16745#discussion_r1746290952
* reduces the verbosity of logs for wheel builds (`-vvv` to `-v`)
  - similar to https://github.com/rapidsai/cugraph/pull/4651

## Notes for Reviewers

This is intentionally targeted at `24.12`. No need to rush this into 24.10 before code freeze.

### How I tested this

<details><summary>locally in docker (click me)</summary>

```shell
docker run \
    --rm \
    --gpus 1 \
    -v $(pwd):/opt/work \
    -w /opt/work \
    -it rapidsai/citestwheel:latest \
    bash

pip install \
    --prefer-binary \
    'cudf-cu12[test]==24.10.*,>=0.0.0a0' \
    'flask' \
    'flask-cors' \
    'moto>=4.0.8' \
    'boto3' \
    's3fs>=2022.3.0'

cd ./python/cudf

pytest \
    cudf/tests/test_s3.py
```

</details>

Authors:
  - James Lamb (https://github.com/jameslamb)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16896
---
 ci/build_wheel.sh                 | 2 +-
 python/cudf/cudf/tests/pytest.ini | 2 +-
 python/dask_cudf/pyproject.toml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 7c1fa705faa..bf76f4ed29a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -12,4 +12,4 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index d05ba9aaacc..496a322ff80 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -9,7 +9,7 @@ filterwarnings =
     ignore:::.*xdist.*
     ignore:::.*pytest.*
     # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
-    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning
+    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore
     # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
     ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
     # PerformanceWarning from cupy warming up the JIT cache
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index c64de06338f..336b2d24948 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -119,7 +119,7 @@ filterwarnings = [
     "error::FutureWarning",
     "error::DeprecationWarning",
     # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
-    "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning",
+    "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning:botocore",
     "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
     # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
     "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",

From 0632538a69f55f6d489d306edf2910a111430425 Mon Sep 17 00:00:00 2001
From: Graham Markall <535640+gmarkall@users.noreply.github.com>
Date: Fri, 27 Sep 2024 02:36:13 +0100
Subject: [PATCH 07/14] Use numba-cuda>=0.0.13 (#16474)

Testing with https://github.com/NVIDIA/numba-cuda on CI.

I am not sure if edits in other repos are required (e.g. I used to have to change an "integration" repo) - any pointers appreciated!

Authors:
  - Graham Markall (https://github.com/gmarkall)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16474
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 dependencies.yaml                                | 6 +++---
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8db03812a19..8b45d26c367 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba>=0.57
+- numba-cuda>=0.0.13
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index fdbe278b66b..354c1360e5a 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -53,7 +53,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba>=0.57
+- numba-cuda>=0.0.13
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==4.0.1
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index e22b4a4eddc..25e69b89789 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -80,7 +80,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0
     - cupy >=12.0.0
-    - numba >=0.57
+    - numba-cuda >=0.0.13
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<18.0.0a0
     - libcudf ={{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index bb8635403a4..ed36a23e5c3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -605,7 +605,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cachetools
-          - &numba numba>=0.57
+          - &numba-cuda-dep numba-cuda>=0.0.13
           - nvtx>=0.2.1
           - packaging
           - rich
@@ -720,7 +720,7 @@ dependencies:
         matrices:
           - matrix: {dependencies: "oldest"}
             packages:
-              - numba==0.57.*
+              - *numba-cuda-dep
               - pandas==2.0.*
           - matrix:
             packages:
@@ -802,7 +802,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - dask-cuda==24.12.*,>=0.0.0a0
-          - *numba
+          - *numba-cuda-dep
     specific:
       - output_types: [conda, requirements]
         matrices:
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index f90cb96e189..605f9be5a49 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "libcudf==24.12.*,>=0.0.0a0",
-    "numba>=0.57",
+    "numba-cuda>=0.0.13",
     "numpy>=1.23,<3.0a0",
     "nvtx>=0.2.1",
     "packaging",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 336b2d24948..76e47b50c3b 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==24.12.*,>=0.0.0a0",
-    "numba>=0.57",
+    "numba-cuda>=0.0.13",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",

From 51e8a3fd446f7ef061c4a5d9aa7ea45f1ac3bab6 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 27 Sep 2024 07:24:41 -0700
Subject: [PATCH 08/14] clang-tidy fixes part 1 (#16937)

This PR includes a first set of fixes found by applying the latest version of clang-tidy to our code base. To keep things reviewable, I've restricted this PR to a smaller set of changes just to the includes.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16937
---
 .../cudf/column/column_device_view.cuh        |  4 +-
 cpp/include/cudf/column/column_view.hpp       |  4 +-
 .../cudf/detail/aggregation/aggregation.hpp   |  4 +-
 .../cudf/detail/groupby/sort_helper.hpp       |  1 -
 .../cudf/detail/utilities/host_vector.hpp     |  2 +-
 .../dictionary/dictionary_column_view.hpp     |  2 +-
 cpp/include/cudf/groupby.hpp                  |  2 +-
 cpp/include/cudf/io/json.hpp                  |  3 -
 cpp/include/cudf/lists/lists_column_view.hpp  |  2 +-
 cpp/include/cudf/scalar/scalar.hpp            |  6 +-
 .../cudf/strings/detail/char_tables.hpp       |  4 +-
 .../cudf/strings/regex/regex_program.hpp      |  4 +-
 cpp/include/cudf/strings/string_view.cuh      | 13 ++--
 .../cudf/strings/strings_column_view.hpp      |  2 +-
 .../cudf/structs/structs_column_view.hpp      |  2 +-
 .../cudf/tdigest/tdigest_column_view.hpp      |  2 +-
 cpp/include/cudf/utilities/span.hpp           | 64 +++++++++----------
 17 files changed, 56 insertions(+), 65 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index c3238cb94fd..35a39ef9758 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -1425,13 +1425,13 @@ struct pair_rep_accessor {
 
  private:
   template <typename R, std::enable_if_t<std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ inline auto get_rep(cudf::size_type i) const
+  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i);
   }
 
   template <typename R, std::enable_if_t<not std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ inline auto get_rep(cudf::size_type i) const
+  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i).value();
   }
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 3ef7bafe727..48f89b8be25 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -235,7 +235,7 @@ class column_view_base {
    *
    * @return Typed pointer to underlying data
    */
-  virtual void const* get_data() const noexcept { return _data; }
+  [[nodiscard]] virtual void const* get_data() const noexcept { return _data; }
 
   data_type _type{type_id::EMPTY};   ///< Element type
   size_type _size{};                 ///< Number of elements
@@ -695,7 +695,7 @@ class mutable_column_view : public detail::column_view_base {
    *
    * @return Typed pointer to underlying data
    */
-  void const* get_data() const noexcept override;
+  [[nodiscard]] void const* get_data() const noexcept override;
 
  private:
   friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type);
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 4255faea702..6661a461b8b 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -683,7 +683,7 @@ class ewma_aggregation final : public scan_aggregation {
   {
   }
 
-  std::unique_ptr<aggregation> clone() const override
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
   {
     return std::make_unique<ewma_aggregation>(*this);
   }
@@ -694,7 +694,7 @@ class ewma_aggregation final : public scan_aggregation {
     return collector.visit(col_type, *this);
   }
 
-  bool is_equal(aggregation const& _other) const override
+  [[nodiscard]] bool is_equal(aggregation const& _other) const override
   {
     if (!this->aggregation::is_equal(_other)) { return false; }
     auto const& other = dynamic_cast<ewma_aggregation const&>(_other);
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index ce8783d8b79..d7a42d0eca5 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -211,7 +211,6 @@ struct sort_groupby_helper {
    */
   column_view keys_bitmask_column(rmm::cuda_stream_view stream);
 
- private:
   column_ptr _key_sorted_order;      ///< Indices to produce _keys in sorted order
   column_ptr _unsorted_keys_labels;  ///< Group labels for unsorted _keys
   column_ptr _keys_bitmask_column;   ///< Column representing rows with one or more nulls values
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
index ecb8f910463..3f6ad7b7b1d 100644
--- a/cpp/include/cudf/detail/utilities/host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -183,7 +183,7 @@ class rmm_host_allocator {
    */
   inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); }
 
-  bool is_device_accessible() const { return _is_device_accessible; }
+  [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; }
 
  private:
   rmm::host_async_resource_ref mr;
diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
index dc822fee38b..5596f78a90b 100644
--- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
@@ -47,7 +47,7 @@ class dictionary_column_view : private column_view {
   dictionary_column_view(column_view const& dictionary_column);
   dictionary_column_view(dictionary_column_view&&)      = default;  ///< Move constructor
   dictionary_column_view(dictionary_column_view const&) = default;  ///< Copy constructor
-  ~dictionary_column_view()                             = default;
+  ~dictionary_column_view() override                    = default;
 
   /**
    * @brief Move assignment operator
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 11c778408fe..c9df02f167a 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -36,7 +36,7 @@ namespace CUDF_EXPORT cudf {
 namespace groupby {
 namespace detail {
 namespace sort {
-class sort_groupby_helper;
+struct sort_groupby_helper;
 
 }  // namespace sort
 }  // namespace detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 6798557e14e..b662b660557 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -116,9 +116,6 @@ class json_reader_options {
   // Whether to parse dates as DD/MM versus MM/DD
   bool _dayfirst = false;
 
-  // Whether to use the legacy reader
-  bool _legacy = false;
-
   // Whether to keep the quote characters of string values
   bool _keep_quotes = false;
 
diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index b117a871b64..d7057cfea7e 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -48,7 +48,7 @@ class lists_column_view : private column_view {
   lists_column_view(column_view const& lists_column);
   lists_column_view(lists_column_view&&)      = default;  ///< Move constructor
   lists_column_view(lists_column_view const&) = default;  ///< Copy constructor
-  ~lists_column_view()                        = default;
+  ~lists_column_view() override               = default;
   /**
    * @brief Copy assignment operator
    *
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index e8a498afc09..66be2a12fbe 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf {
  */
 class scalar {
  public:
+  scalar()                               = delete;
   virtual ~scalar()                      = default;
   scalar& operator=(scalar const& other) = delete;
   scalar& operator=(scalar&& other)      = delete;
@@ -96,8 +97,6 @@ class scalar {
   data_type _type{type_id::EMPTY};     ///< Logical type of value in the scalar
   rmm::device_scalar<bool> _is_valid;  ///< Device bool signifying validity
 
-  scalar() = delete;
-
   /**
    * @brief Move constructor for scalar.
    * @param other The other scalar to move from.
@@ -145,6 +144,7 @@ class fixed_width_scalar : public scalar {
  public:
   using value_type = T;  ///< Type of the value held by the scalar.
 
+  fixed_width_scalar()           = delete;
   ~fixed_width_scalar() override = default;
 
   /**
@@ -203,8 +203,6 @@ class fixed_width_scalar : public scalar {
  protected:
   rmm::device_scalar<T> _data;  ///< device memory containing the value
 
-  fixed_width_scalar() = delete;
-
   /**
    * @brief Construct a new fixed width scalar object.
    *
diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp
index 5d6aff28826..6460d4f43ff 100644
--- a/cpp/include/cudf/strings/detail/char_tables.hpp
+++ b/cpp/include/cudf/strings/detail/char_tables.hpp
@@ -74,9 +74,9 @@ character_cases_table_type const* get_character_cases_table();
  */
 struct special_case_mapping {
   uint16_t num_upper_chars;
-  uint16_t upper[3];
+  uint16_t upper[3];  // NOLINT
   uint16_t num_lower_chars;
-  uint16_t lower[3];
+  uint16_t lower[3];  // NOLINT
 };
 
 /**
diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp
index 9da859d9c87..1bf1c26f471 100644
--- a/cpp/include/cudf/strings/regex/regex_program.hpp
+++ b/cpp/include/cudf/strings/regex/regex_program.hpp
@@ -54,6 +54,8 @@ struct regex_program {
                                                regex_flags flags      = regex_flags::DEFAULT,
                                                capture_groups capture = capture_groups::EXTRACT);
 
+  regex_program() = delete;
+
   /**
    * @brief Move constructor
    *
@@ -115,8 +117,6 @@ struct regex_program {
   ~regex_program();
 
  private:
-  regex_program() = delete;
-
   std::string _pattern;
   regex_flags _flags;
   capture_groups _capture;
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 14695c3bb27..34ed3c5618e 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -99,7 +99,7 @@ __device__ inline std::pair<size_type, size_type> bytes_to_character_position(st
  * values. Also, this char pointer serves as valid device pointer of identity
  * value for minimum operator on string values.
  */
-static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};
+static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};  // NOLINT
 }  // namespace detail
 }  // namespace strings
 
@@ -283,14 +283,11 @@ __device__ inline size_type string_view::const_iterator::position() const { retu
 
 __device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
 
-__device__ inline string_view::const_iterator string_view::begin() const
-{
-  return const_iterator(*this, 0, 0);
-}
+__device__ inline string_view::const_iterator string_view::begin() const { return {*this, 0, 0}; }
 
 __device__ inline string_view::const_iterator string_view::end() const
 {
-  return const_iterator(*this, length(), size_bytes());
+  return {*this, length(), size_bytes()};
 }
 // @endcond
 
@@ -411,7 +408,7 @@ __device__ inline size_type string_view::find(char const* str,
 
 __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
 {
-  char str[sizeof(char_utf8)];
+  char str[sizeof(char_utf8)];  // NOLINT
   size_type chwidth = strings::detail::from_char_utf8(chr, str);
   return find(str, chwidth, pos, count);
 }
@@ -433,7 +430,7 @@ __device__ inline size_type string_view::rfind(char const* str,
 
 __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
 {
-  char str[sizeof(char_utf8)];
+  char str[sizeof(char_utf8)];  // NOLINT
   size_type chwidth = strings::detail::from_char_utf8(chr, str);
   return rfind(str, chwidth, pos, count);
 }
diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp
index 4a2512eb7c5..6ec8d1238d6 100644
--- a/cpp/include/cudf/strings/strings_column_view.hpp
+++ b/cpp/include/cudf/strings/strings_column_view.hpp
@@ -45,7 +45,7 @@ class strings_column_view : private column_view {
   strings_column_view(column_view strings_column);
   strings_column_view(strings_column_view&&)      = default;  ///< Move constructor
   strings_column_view(strings_column_view const&) = default;  ///< Copy constructor
-  ~strings_column_view()                          = default;
+  ~strings_column_view() override                 = default;
   /**
    * @brief Copy assignment operator
    *
diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp
index 19798f51656..91d7ddce955 100644
--- a/cpp/include/cudf/structs/structs_column_view.hpp
+++ b/cpp/include/cudf/structs/structs_column_view.hpp
@@ -42,7 +42,7 @@ class structs_column_view : public column_view {
   // Foundation members:
   structs_column_view(structs_column_view const&) = default;  ///< Copy constructor
   structs_column_view(structs_column_view&&)      = default;  ///< Move constructor
-  ~structs_column_view()                          = default;
+  ~structs_column_view() override                 = default;
   /**
    * @brief Copy assignment operator
    *
diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp
index 2f19efa5630..da4954b859c 100644
--- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp
+++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp
@@ -59,7 +59,7 @@ class tdigest_column_view : private column_view {
   tdigest_column_view(column_view const&);  ///< Construct tdigest_column_view from a column_view
   tdigest_column_view(tdigest_column_view&&)      = default;  ///< Move constructor
   tdigest_column_view(tdigest_column_view const&) = default;  ///< Copy constructor
-  ~tdigest_column_view()                          = default;
+  ~tdigest_column_view() override                 = default;
   /**
    * @brief Copy assignment operator
    *
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 0daebc0dd8d..914731ea417 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -236,26 +236,26 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
 
   /// Constructor from container
   /// @param in The container to construct the span from
-  template <
-    typename C,
-    // Only supported containers of types convertible to T
-    std::enable_if_t<is_host_span_supported_container<C>::value &&
-                     std::is_convertible_v<std::remove_pointer_t<decltype(thrust::raw_pointer_cast(
-                                             std::declval<C&>().data()))> (*)[],
-                                           T (*)[]>>* = nullptr>
+  template <typename C,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<is_host_span_supported_container<C>::value &&
+                             std::is_convertible_v<
+                               std::remove_pointer_t<decltype(thrust::raw_pointer_cast(  // NOLINT
+                                 std::declval<C&>().data()))> (*)[],
+                               T (*)[]>>* = nullptr>  // NOLINT
   constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
 
   /// Constructor from const container
   /// @param in The container to construct the span from
-  template <
-    typename C,
-    // Only supported containers of types convertible to T
-    std::enable_if_t<is_host_span_supported_container<C>::value &&
-                     std::is_convertible_v<std::remove_pointer_t<decltype(thrust::raw_pointer_cast(
-                                             std::declval<C&>().data()))> (*)[],
-                                           T (*)[]>>* = nullptr>
+  template <typename C,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<is_host_span_supported_container<C>::value &&
+                             std::is_convertible_v<
+                               std::remove_pointer_t<decltype(thrust::raw_pointer_cast(  // NOLINT
+                                 std::declval<C&>().data()))> (*)[],
+                               T (*)[]>>* = nullptr>  // NOLINT
   constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
@@ -264,7 +264,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
   /// @param in The host_vector to construct the span from
   template <typename OtherT,
             // Only supported containers of types convertible to T
-            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>
+            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>  // NOLINT
   constexpr host_span(cudf::detail::host_vector<OtherT>& in)
     : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()}
   {
@@ -274,7 +274,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
   /// @param in The host_vector to construct the span from
   template <typename OtherT,
             // Only supported containers of types convertible to T
-            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>
+            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>  // NOLINT
   constexpr host_span(cudf::detail::host_vector<OtherT> const& in)
     : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()}
   {
@@ -285,7 +285,7 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
   template <typename OtherT,
             std::size_t OtherExtent,
             std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) &&
-                               std::is_convertible_v<OtherT (*)[], T (*)[]>,
+                               std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
   constexpr host_span(host_span<OtherT, OtherExtent> const& other) noexcept
     : base(other.data(), other.size())
@@ -333,26 +333,26 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
 
   /// Constructor from container
   /// @param in The container to construct the span from
-  template <
-    typename C,
-    // Only supported containers of types convertible to T
-    std::enable_if_t<is_device_span_supported_container<C>::value &&
-                     std::is_convertible_v<std::remove_pointer_t<decltype(thrust::raw_pointer_cast(
-                                             std::declval<C&>().data()))> (*)[],
-                                           T (*)[]>>* = nullptr>
+  template <typename C,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<is_device_span_supported_container<C>::value &&
+                             std::is_convertible_v<
+                               std::remove_pointer_t<decltype(thrust::raw_pointer_cast(  // NOLINT
+                                 std::declval<C&>().data()))> (*)[],
+                               T (*)[]>>* = nullptr>  // NOLINT
   constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
 
   /// Constructor from const container
   /// @param in The container to construct the span from
-  template <
-    typename C,
-    // Only supported containers of types convertible to T
-    std::enable_if_t<is_device_span_supported_container<C>::value &&
-                     std::is_convertible_v<std::remove_pointer_t<decltype(thrust::raw_pointer_cast(
-                                             std::declval<C&>().data()))> (*)[],
-                                           T (*)[]>>* = nullptr>
+  template <typename C,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<is_device_span_supported_container<C>::value &&
+                             std::is_convertible_v<
+                               std::remove_pointer_t<decltype(thrust::raw_pointer_cast(  // NOLINT
+                                 std::declval<C&>().data()))> (*)[],
+                               T (*)[]>>* = nullptr>  // NOLINT
   constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
@@ -362,7 +362,7 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
   template <typename OtherT,
             std::size_t OtherExtent,
             std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) &&
-                               std::is_convertible_v<OtherT (*)[], T (*)[]>,
+                               std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
   constexpr device_span(device_span<OtherT, OtherExtent> const& other) noexcept
     : base(other.data(), other.size())

From 1f25d7a24c5d58e6c1acdb3d3fbabc6a5a39ebe6 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 27 Sep 2024 09:13:43 -0700
Subject: [PATCH 09/14] clang-tidy fixes part 3 (#16939)

Subset of improvements to the code base proposed by the latest version of clang-tidy.

**Note to reviewers**: The changeset looks deceptively large. Almost all of the changes are really just switching from raw C-style arrays to C++ std::arrays.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Basit Ayantunde (https://github.com/lamarrr)

URL: https://github.com/rapidsai/cudf/pull/16939
---
 cpp/tests/copying/copy_tests.cpp              | 174 ++++++++--------
 cpp/tests/filling/sequence_tests.cpp          |  13 +-
 cpp/tests/groupby/collect_list_tests.cpp      |   7 +-
 cpp/tests/interop/dlpack_test.cpp             |  22 +--
 cpp/tests/io/orc_test.cpp                     |  77 ++++----
 cpp/tests/io/parquet_chunked_writer_test.cpp  |  72 +++----
 cpp/tests/io/parquet_common.cpp               |  18 +-
 cpp/tests/io/parquet_misc_test.cpp            |   9 +-
 cpp/tests/io/parquet_reader_test.cpp          | 187 +++++++++---------
 cpp/tests/io/parquet_v2_test.cpp              |  44 +++--
 cpp/tests/io/parquet_writer_test.cpp          |  88 ++++-----
 cpp/tests/json/json_tests.cpp                 |  12 +-
 cpp/tests/reductions/reduction_tests.cpp      |   7 +-
 cpp/tests/reductions/scan_tests.cpp           |   4 +-
 cpp/tests/rolling/nth_element_test.cpp        |   4 +-
 cpp/tests/streams/transform_test.cpp          |   2 +-
 cpp/tests/strings/chars_types_tests.cpp       |  29 +--
 cpp/tests/strings/contains_tests.cpp          |  69 +++----
 cpp/tests/strings/durations_tests.cpp         | 143 +++++++-------
 cpp/tests/strings/extract_tests.cpp           |   6 +-
 cpp/tests/strings/findall_tests.cpp           |   6 +-
 .../integration/unary_transform_test.cpp      |   2 +-
 22 files changed, 483 insertions(+), 512 deletions(-)

diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index 7c8729b6a77..4124f749012 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -73,44 +73,45 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong)
   using T = TypeParam;
 
   // make sure we span at least 2 warps
-  int num_els = 64;
-
-  bool mask[] = {true, false, true, false, true, true, true,  true,  true,  true,  true, true, true,
-                 true, true,  true, true,  true, true, false, false, false, false, true, true, true,
-                 true, true,  true, true,  true, true, false, false, false, false, true, true, true,
-                 true, true,  true, true,  true, true, true,  true,  true,  true,  true, true, true,
-                 true, true,  true, true,  true, true, true,  true,  true,  true,  true, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
-
-  bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true};
-  wrapper<T, int32_t> lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-                             5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-                             5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
-                            lhs_v);
-
-  bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true,
-                  true, true, true, true, true, true, true,  true,  true, true, true, true, true,
-                  true, true, true, true, true, true, true,  true,  true, true, true, true, true,
-                  true, true, true, true, true, true, true,  true,  true, true, true, true, true,
-                  true, true, true, true, true, true, true,  true,  true, true, true, true};
-  wrapper<T, int32_t> rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                             6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                             6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
-                            rhs_v);
-
-  bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true, true,
-                  true, true, true, true, true,  true,  true, true, true, true, true, true};
-  wrapper<T, int32_t> expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
-                                  6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
-                                  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
-                                 exp_v);
+  constexpr int num_els = 64;
+
+  std::array<bool, num_els> mask{
+    true, false, true, false, true, true, true,  true,  true,  true,  true, true, true,
+    true, true,  true, true,  true, true, false, false, false, false, true, true, true,
+    true, true,  true, true,  true, true, false, false, false, false, true, true, true,
+    true, true,  true, true,  true, true, true,  true,  true,  true,  true, true, true,
+    true, true,  true, true,  true, true, true,  true,  true,  true,  true, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
+
+  wrapper<T, int32_t> lhs_w(
+    {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+    {true, true, true, true, false, false, true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true});
+
+  wrapper<T, int32_t> rhs_w(
+    {6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6},
+    {true, true, true, true, true, true, false, false, true, true, true, true, true,
+     true, true, true, true, true, true, true,  true,  true, true, true, true, true,
+     true, true, true, true, true, true, true,  true,  true, true, true, true, true,
+     true, true, true, true, true, true, true,  true,  true, true, true, true, true,
+     true, true, true, true, true, true, true,  true,  true, true, true, true});
+
+  wrapper<T, int32_t> expected_w(
+    {5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
+     6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5},
+    {true, true, true, true, false, false, true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true, true,
+     true, true, true, true, true,  true,  true, true, true, true, true, true});
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -318,19 +319,17 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[] = {true, false, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
+  std::array mask{true, false, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   cudf::numeric_scalar<T> lhs_w(5);
 
   auto const rhs = cudf::test::make_type_param_vector<T>({6, 6, 6, 6});
-  bool rhs_v[]   = {true, false, true, true};
-  wrapper<T> rhs_w(rhs.begin(), rhs.end(), rhs_v);
+  std::array rhs_v{true, false, true, true};
+  wrapper<T> rhs_w(rhs.begin(), rhs.end(), rhs_v.begin());
 
   auto const expected = cudf::test::make_type_param_vector<T>({5, 6, 6, 5});
-  wrapper<T> expected_w(expected.begin(), expected.end(), rhs_v);
+  wrapper<T> expected_w(expected.begin(), expected.end(), rhs_v.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -340,20 +339,18 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[]   = {true, false, false, true};
-  bool mask_v[] = {true, true, true, false};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els, mask_v);
+  std::array mask{true, false, false, true};
+  std::array mask_v{true, true, true, false};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end(), mask_v.begin());
 
   auto const lhs = cudf::test::make_type_param_vector<T>({5, 5, 5, 5});
-  bool lhs_v[]   = {false, true, true, true};
-  wrapper<T> lhs_w(lhs.begin(), lhs.end(), lhs_v);
+  std::array lhs_v{false, true, true, true};
+  wrapper<T> lhs_w(lhs.begin(), lhs.end(), lhs_v.begin());
 
   cudf::numeric_scalar<T> rhs_w(6);
 
   auto const expected = cudf::test::make_type_param_vector<T>({5, 6, 6, 6});
-  wrapper<T> expected_w(expected.begin(), expected.end(), lhs_v);
+  wrapper<T> expected_w(expected.begin(), expected.end(), lhs_v.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -363,16 +360,14 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[] = {true, false, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
+  std::array mask{true, false, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   cudf::numeric_scalar<T> lhs_w(5);
   cudf::numeric_scalar<T> rhs_w(6, false);
 
   auto const expected = cudf::test::make_type_param_vector<T>({5, 6, 6, 5});
-  wrapper<T> expected_w(expected.begin(), expected.end(), mask);
+  wrapper<T> expected_w(expected.begin(), expected.end(), mask.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -405,17 +400,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[] = {true, false, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
+  std::array mask{true, false, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   auto lhs_w = create_chrono_scalar<T>{}(cudf::test::make_type_param_scalar<T>(5), true);
 
-  bool rhs_v[] = {true, false, true, true};
-  wrapper<T, int32_t> rhs_w({6, 6, 6, 6}, rhs_v);
+  std::array rhs_v{true, false, true, true};
+  wrapper<T, int32_t> rhs_w({6, 6, 6, 6}, rhs_v.begin());
 
-  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, rhs_v);
+  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, rhs_v.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -425,17 +418,15 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[] = {true, false, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
+  std::array mask{true, false, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
-  bool lhs_v[] = {false, true, true, true};
-  wrapper<T, int32_t> lhs_w({5, 5, 5, 5}, lhs_v);
+  std::array lhs_v{false, true, true, true};
+  wrapper<T, int32_t> lhs_w({5, 5, 5, 5}, lhs_v.begin());
 
   auto rhs_w = create_chrono_scalar<T>{}(cudf::test::make_type_param_scalar<T>(6), true);
 
-  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, lhs_v);
+  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, lhs_v.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -445,15 +436,13 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar)
 {
   using T = TypeParam;
 
-  int num_els = 4;
-
-  bool mask[] = {true, false, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + num_els);
+  std::array mask{true, false, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   auto lhs_w = create_chrono_scalar<T>{}(cudf::test::make_type_param_scalar<T>(5), true);
   auto rhs_w = create_chrono_scalar<T>{}(cudf::test::make_type_param_scalar<T>(6), false);
 
-  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, mask);
+  wrapper<T, int32_t> expected_w({5, 6, 6, 5}, mask.begin());
 
   auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w);
@@ -483,9 +472,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse)
   std::vector<char const*> h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"};
   cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids);
 
-  bool mask[]   = {true, true, false, true, false, true};
-  bool mask_v[] = {true, true, true, true, true, false};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6, mask_v);
+  std::array mask{true, true, false, true, false, true};
+  std::array mask_v{true, true, true, true, true, false};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end(), mask_v.begin());
 
   auto results = cudf::copy_if_else(strings1, strings2, mask_w);
 
@@ -510,9 +499,9 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn)
   std::vector<char const*> h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"};
   cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids);
 
-  bool mask[]   = {true, false, true, false, true, false};
-  bool mask_v[] = {true, true, true, true, true, false};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6, mask_v);
+  std::array mask{true, false, true, false, true, false};
+  std::array mask_v{true, true, true, true, true, false};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end(), mask_v.begin());
 
   auto results = cudf::copy_if_else(strings1, strings2, mask_w);
 
@@ -538,8 +527,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar)
   std::vector<char const*> h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"};
   cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids);
 
-  bool mask[] = {false, true, true, true, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6);
+  std::array mask{false, true, true, true, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   auto results = cudf::copy_if_else(strings2, strings1, mask_w);
 
@@ -565,9 +554,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar)
   std::vector<char const*> h_string2{"aaa"};
   cudf::string_scalar string2{h_string2[0], false};
 
-  constexpr cudf::size_type mask_size = 6;
-  bool mask[]                         = {true, false, true, false, true, false};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + mask_size);
+  std::array mask{true, false, true, false, true, false};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   auto results = cudf::copy_if_else(string1, string2, mask_w);
 
@@ -652,9 +640,9 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn)
   cudf::test::dictionary_column_wrapper<std::string> input2(
     h_strings2.begin(), h_strings2.end(), valids);
 
-  bool mask[]   = {true, true, false, true, false, true};
-  bool mask_v[] = {true, true, true, true, true, false};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6, mask_v);
+  std::array mask{true, true, false, true, false, true};
+  std::array mask_v{true, true, true, true, true, false};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end(), mask_v.begin());
 
   auto results = cudf::copy_if_else(input1, input2, mask_w);
   auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view()));
@@ -679,8 +667,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar)
   cudf::test::dictionary_column_wrapper<std::string> input2(
     h_strings.begin(), h_strings.end(), valids);
 
-  bool mask[] = {false, true, true, true, false, true};
-  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask, mask + 6);
+  std::array mask{false, true, true, true, false, true};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w(mask.begin(), mask.end());
 
   auto results = cudf::copy_if_else(input2, input1, mask_w);
   auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view()));
diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp
index 5651a26f192..0783b4e5bbb 100644
--- a/cpp/tests/filling/sequence_tests.cpp
+++ b/cpp/tests/filling/sequence_tests.cpp
@@ -41,8 +41,7 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing)
 
   cudf::size_type num_els = 10;
 
-  T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  cudf::test::fixed_width_column_wrapper<T> expected_w(expected, expected + num_els);
+  cudf::test::fixed_width_column_wrapper<T> expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
 
   auto result = cudf::sequence(num_els, init, step);
 
@@ -58,8 +57,8 @@ TYPED_TEST(SequenceTypedTestFixture, Decrementing)
 
   cudf::size_type num_els = 10;
 
-  T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45};
-  cudf::test::fixed_width_column_wrapper<T> expected_w(expected, expected + num_els);
+  cudf::test::fixed_width_column_wrapper<T> expected_w(
+    {0, -5, -10, -15, -20, -25, -30, -35, -40, -45});
 
   auto result = cudf::sequence(num_els, init, step);
 
@@ -75,8 +74,7 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput)
 
   cudf::size_type num_els = 0;
 
-  T expected[] = {};
-  cudf::test::fixed_width_column_wrapper<T> expected_w(expected, expected + num_els);
+  cudf::test::fixed_width_column_wrapper<T> expected_w({});
 
   auto result = cudf::sequence(num_els, init, step);
 
@@ -121,8 +119,7 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep)
 
   cudf::size_type num_els = 10;
 
-  T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  cudf::test::fixed_width_column_wrapper<T> expected_w(expected, expected + num_els);
+  cudf::test::fixed_width_column_wrapper<T> expected_w({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
 
   auto result = cudf::sequence(num_els, init);
 
diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp
index 749f4013013..a79b6a32916 100644
--- a/cpp/tests/groupby/collect_list_tests.cpp
+++ b/cpp/tests/groupby/collect_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -127,8 +127,9 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion)
   using LCW = cudf::test::lists_column_wrapper<V, int32_t>;
 
   cudf::test::fixed_width_column_wrapper<K, int32_t> keys{1, 1, 2, 2, 3, 3, 4, 4};
-  bool const validity_mask[] = {true, false, false, true, true, true, false, false};
-  LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}}, validity_mask};
+  std::array const validity_mask{true, false, false, true, true, true, false, false};
+  LCW values{{{1, 2}, {3, 4}, {5, 6, 7}, LCW{}, {9, 10}, {11}, {20, 30, 40}, LCW{}},
+             validity_mask.data()};
 
   cudf::test::fixed_width_column_wrapper<K, int32_t> expect_keys{1, 2, 3, 4};
 
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index 330f07ac8e2..ef4b9dd9b8a 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -225,8 +225,8 @@ TEST_F(DLPackUntypedTests, UnsupportedBroadcast1DTensorFromDlpack)
   constexpr int ndim = 1;
   // Broadcasted (stride-0) 1D tensor
   auto const data       = cudf::test::make_type_param_vector<T>({1});
-  int64_t shape[ndim]   = {5};
-  int64_t strides[ndim] = {0};
+  int64_t shape[ndim]   = {5};  // NOLINT
+  int64_t strides[ndim] = {0};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
@@ -248,8 +248,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStrided1DTensorFromDlpack)
   constexpr int ndim = 1;
   // Strided 1D tensor
   auto const data       = cudf::test::make_type_param_vector<T>({1, 2, 3, 4});
-  int64_t shape[ndim]   = {2};
-  int64_t strides[ndim] = {2};
+  int64_t shape[ndim]   = {2};  // NOLINT
+  int64_t strides[ndim] = {2};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
@@ -271,7 +271,7 @@ TEST_F(DLPackUntypedTests, UnsupportedImplicitRowMajor2DTensorFromDlpack)
   constexpr int ndim = 2;
   // Row major 2D tensor
   auto const data     = cudf::test::make_type_param_vector<T>({1, 2, 3, 4});
-  int64_t shape[ndim] = {2, 2};
+  int64_t shape[ndim] = {2, 2};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
@@ -293,8 +293,8 @@ TEST_F(DLPackUntypedTests, UnsupportedExplicitRowMajor2DTensorFromDlpack)
   constexpr int ndim = 2;
   // Row major 2D tensor with explicit strides
   auto const data       = cudf::test::make_type_param_vector<T>({1, 2, 3, 4});
-  int64_t shape[ndim]   = {2, 2};
-  int64_t strides[ndim] = {2, 1};
+  int64_t shape[ndim]   = {2, 2};  // NOLINT
+  int64_t strides[ndim] = {2, 1};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
@@ -316,8 +316,8 @@ TEST_F(DLPackUntypedTests, UnsupportedStridedColMajor2DTensorFromDlpack)
   constexpr int ndim = 2;
   // Column major, but strided in fastest dimension
   auto const data       = cudf::test::make_type_param_vector<T>({1, 2, 3, 4, 5, 6, 7, 8});
-  int64_t shape[ndim]   = {2, 2};
-  int64_t strides[ndim] = {2, 4};
+  int64_t shape[ndim]   = {2, 2};  // NOLINT
+  int64_t strides[ndim] = {2, 4};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
@@ -465,8 +465,8 @@ TYPED_TEST(DLPackNumericTests, FromDlpackCpu)
   using T         = TypeParam;
   auto const data = cudf::test::make_type_param_vector<T>({0, 1, 2, 3, 4, 0, 5, 6, 7, 8, 0});
   uint64_t const offset{sizeof(T)};
-  int64_t shape[2]   = {4, 2};
-  int64_t strides[2] = {1, 5};
+  int64_t shape[2]   = {4, 2};  // NOLINT
+  int64_t strides[2] = {1, 5};  // NOLINT
 
   DLManagedTensor tensor{};
   tensor.dl_tensor.device.device_type = kDLCPU;
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 39ba62952b4..89e704f3ed3 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -38,6 +38,7 @@
 
 #include <src/io/comp/nvcomp_adapter.hpp>
 
+#include <array>
 #include <type_traits>
 
 template <typename T, typename SourceElementT = T>
@@ -767,14 +768,14 @@ TEST_F(OrcChunkedWriterTest, Metadata)
 
 TEST_F(OrcChunkedWriterTest, Strings)
 {
-  bool mask1[] = {true, true, false, true, true, true, true};
+  std::array mask1{true, true, false, true, true, true, true};
   std::vector<char const*> h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"};
-  str_col strings1(h_strings1.begin(), h_strings1.end(), mask1);
+  str_col strings1(h_strings1.begin(), h_strings1.end(), mask1.data());
   table_view tbl1({strings1});
 
-  bool mask2[] = {false, true, true, true, true, true, true};
+  std::array mask2{false, true, true, true, true, true, true};
   std::vector<char const*> h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"};
-  str_col strings2(h_strings2.begin(), h_strings2.end(), mask2);
+  str_col strings2(h_strings2.begin(), h_strings2.end(), mask2.data());
   table_view tbl2({strings2});
 
   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -877,26 +878,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize)
 
   using T = TypeParam;
 
-  int num_els = 31;
+  constexpr int num_els{31};
 
-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true};
+  std::array<bool, num_els> mask{false, true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true};
 
-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   table_view tbl1({c1a_w, c1b_w});
 
-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   table_view tbl2({c2a_w, c2b_w});
 
   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -920,26 +921,26 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2)
 
   using T = TypeParam;
 
-  int num_els = 33;
+  constexpr int num_els = 33;
 
-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true};
+  std::array<bool, num_els> mask{false, true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true};
 
-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   table_view tbl1({c1a_w, c1b_w});
 
-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   table_view tbl2({c2a_w, c2b_w});
 
   auto expected = cudf::concatenate(std::vector<table_view>({tbl1, tbl2}));
@@ -1140,7 +1141,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression)
   }
 
   // Test with zstd compressed orc file with high compression ratio.
-  constexpr uint8_t input_buffer[] = {
+  constexpr std::array<uint8_t, 170> input_buffer{
     0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74,
     0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04,
     0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb,
@@ -1154,7 +1155,7 @@ TEST_F(OrcReaderTest, zstdCompressionRegression)
     0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17};
 
   auto source =
-    cudf::io::source_info(reinterpret_cast<char const*>(input_buffer), sizeof(input_buffer));
+    cudf::io::source_info(reinterpret_cast<char const*>(input_buffer.data()), input_buffer.size());
   cudf::io::orc_reader_options in_opts =
     cudf::io::orc_reader_options::builder(source).use_index(false);
 
diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp
index 282c6f3adad..810fee89c48 100644
--- a/cpp/tests/io/parquet_chunked_writer_test.cpp
+++ b/cpp/tests/io/parquet_chunked_writer_test.cpp
@@ -124,15 +124,15 @@ TEST_F(ParquetChunkedWriterTest, Strings)
 {
   std::vector<std::unique_ptr<cudf::column>> cols;
 
-  bool mask1[] = {true, true, false, true, true, true, true};
+  std::array mask1{true, true, false, true, true, true, true};
   std::vector<char const*> h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"};
-  cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1);
+  cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1.data());
   cols.push_back(strings1.release());
   cudf::table tbl1(std::move(cols));
 
-  bool mask2[] = {false, true, true, true, true, true, true};
+  std::array mask2{false, true, true, true, true, true, true};
   std::vector<char const*> h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"};
-  cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2);
+  cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2.data());
   cols.push_back(strings2.release());
   cudf::table tbl2(std::move(cols));
 
@@ -771,29 +771,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize)
 
   using T = TypeParam;
 
-  int num_els = 31;
+  constexpr int num_els = 31;
   std::vector<std::unique_ptr<cudf::column>> cols;
 
-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true,
+  std::array<bool, num_els> mask{false, true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true,
 
-                 true,  true, true, true, true, true, true, true, true};
-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+                                 true,  true, true, true, true, true, true, true, true};
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   cols.push_back(c1a_w.release());
   cols.push_back(c1b_w.release());
   cudf::table tbl1(std::move(cols));
 
-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   cols.push_back(c2a_w.release());
   cols.push_back(c2b_w.release());
   cudf::table tbl2(std::move(cols));
@@ -819,29 +819,29 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2)
 
   using T = TypeParam;
 
-  int num_els = 33;
+  constexpr int num_els = 33;
   std::vector<std::unique_ptr<cudf::column>> cols;
 
-  bool mask[] = {false, true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true,
-                 true,  true, true, true, true, true, true, true, true, true, true};
+  std::array<bool, num_els> mask{false, true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true,
+                                 true,  true, true, true, true, true, true, true, true, true, true};
 
-  T c1a[num_els];
-  std::fill(c1a, c1a + num_els, static_cast<T>(5));
-  T c1b[num_els];
-  std::fill(c1b, c1b + num_els, static_cast<T>(6));
-  column_wrapper<T> c1a_w(c1a, c1a + num_els, mask);
-  column_wrapper<T> c1b_w(c1b, c1b + num_els, mask);
+  std::array<T, num_els> c1a;
+  std::fill(c1a.begin(), c1a.end(), static_cast<T>(5));
+  std::array<T, num_els> c1b;
+  std::fill(c1b.begin(), c1b.end(), static_cast<T>(6));
+  column_wrapper<T> c1a_w(c1a.begin(), c1a.end(), mask.begin());
+  column_wrapper<T> c1b_w(c1b.begin(), c1b.end(), mask.begin());
   cols.push_back(c1a_w.release());
   cols.push_back(c1b_w.release());
   cudf::table tbl1(std::move(cols));
 
-  T c2a[num_els];
-  std::fill(c2a, c2a + num_els, static_cast<T>(8));
-  T c2b[num_els];
-  std::fill(c2b, c2b + num_els, static_cast<T>(9));
-  column_wrapper<T> c2a_w(c2a, c2a + num_els, mask);
-  column_wrapper<T> c2b_w(c2b, c2b + num_els, mask);
+  std::array<T, num_els> c2a;
+  std::fill(c2a.begin(), c2a.end(), static_cast<T>(8));
+  std::array<T, num_els> c2b;
+  std::fill(c2b.begin(), c2b.end(), static_cast<T>(9));
+  column_wrapper<T> c2a_w(c2a.begin(), c2a.end(), mask.begin());
+  column_wrapper<T> c2b_w(c2b.begin(), c2b.end(), mask.begin());
   cols.push_back(c2a_w.release());
   cols.push_back(c2b_w.release());
   cudf::table tbl2(std::move(cols));
diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp
index 3dd5ad145ea..6141a40bc95 100644
--- a/cpp/tests/io/parquet_common.cpp
+++ b/cpp/tests/io/parquet_common.cpp
@@ -483,10 +483,10 @@ template <typename T>
 std::enable_if_t<std::is_same_v<T, cudf::string_view>, cudf::test::strings_column_wrapper>
 ascending()
 {
-  char buf[10];
+  std::array<char, 10> buf;
   auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) {
-    sprintf(buf, "%09d", i);
-    return std::string(buf);
+    sprintf(buf.data(), "%09d", i);
+    return std::string(buf.data());
   });
   return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows);
 }
@@ -495,10 +495,10 @@ template <typename T>
 std::enable_if_t<std::is_same_v<T, cudf::string_view>, cudf::test::strings_column_wrapper>
 descending()
 {
-  char buf[10];
+  std::array<char, 10> buf;
   auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) {
-    sprintf(buf, "%09d", num_ordered_rows - i);
-    return std::string(buf);
+    sprintf(buf.data(), "%09d", num_ordered_rows - i);
+    return std::string(buf.data());
   });
   return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows);
 }
@@ -507,10 +507,10 @@ template <typename T>
 std::enable_if_t<std::is_same_v<T, cudf::string_view>, cudf::test::strings_column_wrapper>
 unordered()
 {
-  char buf[10];
+  std::array<char, 10> buf;
   auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) {
-    sprintf(buf, "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i));
-    return std::string(buf);
+    sprintf(buf.data(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i));
+    return std::string(buf.data());
   });
   return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows);
 }
diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp
index 01027d04658..8b03e94191e 100644
--- a/cpp/tests/io/parquet_misc_test.cpp
+++ b/cpp/tests/io/parquet_misc_test.cpp
@@ -23,6 +23,8 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/transform.hpp>
 
+#include <array>
+
 ////////////////////////////////
 // delta encoding writer tests
 
@@ -225,10 +227,9 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted)
 
   // now check that the boundary order for chunk 1 is ascending,
   // chunk 2 is descending, and chunk 3 is unordered
-  cudf::io::parquet::detail::BoundaryOrder expected_orders[] = {
-    cudf::io::parquet::detail::BoundaryOrder::ASCENDING,
-    cudf::io::parquet::detail::BoundaryOrder::DESCENDING,
-    cudf::io::parquet::detail::BoundaryOrder::UNORDERED};
+  std::array expected_orders{cudf::io::parquet::detail::BoundaryOrder::ASCENDING,
+                             cudf::io::parquet::detail::BoundaryOrder::DESCENDING,
+                             cudf::io::parquet::detail::BoundaryOrder::UNORDERED};
 
   for (std::size_t i = 0; i < columns.size(); i++) {
     auto const ci = read_column_index(source, columns[i]);
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 6c61535359f..dc8e68b3a15 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -29,6 +29,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
 
+#include <array>
+
 TEST_F(ParquetReaderTest, UserBounds)
 {
   // trying to read more rows than there are should result in
@@ -569,7 +571,8 @@ TEST_F(ParquetReaderTest, DecimalRead)
        This test is a temporary test until python gains the ability to write decimal, so we're
        embedding
        a parquet file directly into the code here to prevent issues with finding the file */
-    unsigned char const decimals_parquet[] = {
+    constexpr unsigned int decimals_parquet_len = 2366;
+    std::array<unsigned char, decimals_parquet_len> const decimals_parquet{
       0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a,
       0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00,
       0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00,
@@ -728,10 +731,10 @@ TEST_F(ParquetReaderTest, DecimalRead)
       0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30,
       0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c,
       0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
-    unsigned int decimals_parquet_len = 2366;
 
-    cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(
-      cudf::io::source_info{reinterpret_cast<char const*>(decimals_parquet), decimals_parquet_len});
+    cudf::io::parquet_reader_options read_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{
+        reinterpret_cast<char const*>(decimals_parquet.data()), decimals_parquet_len});
     auto result = cudf::io::read_parquet(read_opts);
 
     auto validity =
@@ -739,7 +742,7 @@ TEST_F(ParquetReaderTest, DecimalRead)
 
     EXPECT_EQ(result.tbl->view().num_columns(), 3);
 
-    int32_t col0_data[] = {
+    std::array<int32_t, 53> col0_data{
       -2354584, -190275,  8393572,  6446515,  -5687920, -1843550, -6897687, -6780385, 3428529,
       5842056,  -4312278, -4450603, -7516141, 2974667,  -4288640, 1065090,  -9410428, 7891355,
       1076244,  -1975984, 6999466,  2666959,  9262967,  7931374,  -1370640, 451074,   8799111,
@@ -753,29 +756,28 @@ TEST_F(ParquetReaderTest, DecimalRead)
       std::begin(col0_data), std::end(col0_data), validity, numeric::scale_type{-4});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0);
 
-    int64_t col1_data[] = {29274040266581,  -17210335917753, -58420730139037,
-                           68073792696254,  2236456014294,   13704555677045,
-                           -70797090469548, -52248605513407, -68976081919961,
-                           -34277313883112, 97774730521689,  21184241014572,
-                           -670882460254,   -40862944054399, -24079852370612,
-                           -88670167797498, -84007574359403, -71843004533519,
-                           -55538016554201, 3491435293032,   -29085437167297,
-                           36901882672273,  -98622066122568, -13974902998457,
-                           86712597643378,  -16835133643735, -94759096142232,
-                           30708340810940,  79086853262082,  78923696440892,
-                           -76316597208589, 37247268714759,  80303592631774,
-                           57790350050889,  19387319851064,  -33186875066145,
-                           69701203023404,  -7157433049060,  -7073790423437,
-                           92769171617714,  -75127120182184, -951893180618,
-                           64927618310150,  -53875897154023, -16168039035569,
-                           -24273449166429, -30359781249192, 35639397345991,
-                           45844829680593,  71401416837149,  0,
-                           -99999999999999, 99999999999999};
-
-    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(1).size()),
-              sizeof(col1_data) / sizeof(col1_data[0]));
+    std::array<int64_t, 53> col1_data{29274040266581,  -17210335917753, -58420730139037,
+                                      68073792696254,  2236456014294,   13704555677045,
+                                      -70797090469548, -52248605513407, -68976081919961,
+                                      -34277313883112, 97774730521689,  21184241014572,
+                                      -670882460254,   -40862944054399, -24079852370612,
+                                      -88670167797498, -84007574359403, -71843004533519,
+                                      -55538016554201, 3491435293032,   -29085437167297,
+                                      36901882672273,  -98622066122568, -13974902998457,
+                                      86712597643378,  -16835133643735, -94759096142232,
+                                      30708340810940,  79086853262082,  78923696440892,
+                                      -76316597208589, 37247268714759,  80303592631774,
+                                      57790350050889,  19387319851064,  -33186875066145,
+                                      69701203023404,  -7157433049060,  -7073790423437,
+                                      92769171617714,  -75127120182184, -951893180618,
+                                      64927618310150,  -53875897154023, -16168039035569,
+                                      -24273449166429, -30359781249192, 35639397345991,
+                                      45844829680593,  71401416837149,  0,
+                                      -99999999999999, 99999999999999};
+
+    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(1).size()), col1_data.size());
     cudf::test::fixed_point_column_wrapper<int64_t> col1(
-      std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5});
+      col1_data.begin(), col1_data.end(), validity, numeric::scale_type{-5});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1);
 
     cudf::io::parquet_reader_options read_strict_opts = read_opts;
@@ -786,7 +788,7 @@ TEST_F(ParquetReaderTest, DecimalRead)
     // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4)
     // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6)
     // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9)
-    unsigned char const fixed_len_bytes_decimal_parquet[] = {
+    std::array<unsigned char, 1226> const fixed_len_bytes_decimal_parquet{
       0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28,
       0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72,
       0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00,
@@ -875,75 +877,72 @@ TEST_F(ParquetReaderTest, DecimalRead)
 
     cudf::io::parquet_reader_options read_opts =
       cudf::io::parquet_reader_options::builder(cudf::io::source_info{
-        reinterpret_cast<char const*>(fixed_len_bytes_decimal_parquet), parquet_len});
+        reinterpret_cast<char const*>(fixed_len_bytes_decimal_parquet.data()), parquet_len});
     auto result = cudf::io::read_parquet(read_opts);
     EXPECT_EQ(result.tbl->view().num_columns(), 3);
 
-    auto validity_c0    = cudf::test::iterators::nulls_at({19});
-    int32_t col0_data[] = {6361295, 698632,  7821423, 7073444, 9631892, 3021012, 5195059,
-                           9913714, 901749,  7776938, 3186566, 4955569, 5131067, 98619,
-                           2282579, 7521455, 4430706, 1937859, 4532040, 0};
+    auto validity_c0 = cudf::test::iterators::nulls_at({19});
+    std::array col0_data{6361295, 698632,  7821423, 7073444, 9631892, 3021012, 5195059,
+                         9913714, 901749,  7776938, 3186566, 4955569, 5131067, 98619,
+                         2282579, 7521455, 4430706, 1937859, 4532040, 0};
 
-    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(0).size()),
-              sizeof(col0_data) / sizeof(col0_data[0]));
+    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(0).size()), col0_data.size());
     cudf::test::fixed_point_column_wrapper<int32_t> col0(
-      std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3});
+      col0_data.begin(), col0_data.end(), validity_c0, numeric::scale_type{-3});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0);
 
-    auto validity_c1    = cudf::test::iterators::nulls_at({18});
-    int64_t col1_data[] = {361378026250,
-                           30646804862,
-                           429930238629,
-                           418758703536,
-                           895494171113,
-                           435283865083,
-                           809096053722,
-                           -999999999999,
-                           426465099333,
-                           526684574144,
-                           826310892810,
-                           584686967589,
-                           113822282951,
-                           409236212092,
-                           420631167535,
-                           918438386086,
-                           -999999999999,
-                           489053889147,
-                           0,
-                           363993164092};
-
-    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(1).size()),
-              sizeof(col1_data) / sizeof(col1_data[0]));
+    auto validity_c1 = cudf::test::iterators::nulls_at({18});
+    std::array<int64_t, 20> col1_data{361378026250,
+                                      30646804862,
+                                      429930238629,
+                                      418758703536,
+                                      895494171113,
+                                      435283865083,
+                                      809096053722,
+                                      -999999999999,
+                                      426465099333,
+                                      526684574144,
+                                      826310892810,
+                                      584686967589,
+                                      113822282951,
+                                      409236212092,
+                                      420631167535,
+                                      918438386086,
+                                      -999999999999,
+                                      489053889147,
+                                      0,
+                                      363993164092};
+
+    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(1).size()), col1_data.size());
     cudf::test::fixed_point_column_wrapper<int64_t> col1(
-      std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11});
+      col1_data.begin(), col1_data.end(), validity_c1, numeric::scale_type{-11});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1);
 
-    auto validity_c2       = cudf::test::iterators::nulls_at({6, 14});
-    __int128_t col2_data[] = {9078697037144433659,
-                              9050770539577117612,
-                              2358363961733893636,
-                              1566059559232276662,
-                              6658306200002735268,
-                              4967909073046397334,
-                              0,
-                              7235588493887532473,
-                              5023160741463849572,
-                              2765173712965988273,
-                              3880866513515749646,
-                              5019704400576359500,
-                              5544435986818825655,
-                              7265381725809874549,
-                              0,
-                              1576192427381240677,
-                              2828305195087094598,
-                              260308667809395171,
-                              2460080200895288476,
-                              2718441925197820439};
-
-    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(2).size()),
-              sizeof(col2_data) / sizeof(col2_data[0]));
+    auto validity_c2 = cudf::test::iterators::nulls_at({6, 14});
+    std::array<__int128_t, 20> col2_data{9078697037144433659,
+                                         9050770539577117612,
+                                         2358363961733893636,
+                                         1566059559232276662,
+                                         6658306200002735268,
+                                         4967909073046397334,
+                                         0,
+                                         7235588493887532473,
+                                         5023160741463849572,
+                                         2765173712965988273,
+                                         3880866513515749646,
+                                         5019704400576359500,
+                                         5544435986818825655,
+                                         7265381725809874549,
+                                         0,
+                                         1576192427381240677,
+                                         2828305195087094598,
+                                         260308667809395171,
+                                         2460080200895288476,
+                                         2718441925197820439};
+
+    EXPECT_EQ(static_cast<std::size_t>(result.tbl->view().column(2).size()), col2_data.size());
     cudf::test::fixed_point_column_wrapper<__int128_t> col2(
-      std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1});
+      col2_data.begin(), col2_data.end(), validity_c2, numeric::scale_type{-1});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2);
   }
 }
@@ -1221,7 +1220,7 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest)
 
 TEST_F(ParquetReaderTest, SingleLevelLists)
 {
-  unsigned char list_bytes[] = {
+  std::array<unsigned char, 214> list_bytes{
     0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06,
     0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03,
     0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15,
@@ -1239,7 +1238,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists)
 
   // read single level list reproducing parquet file
   cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(
-    cudf::io::source_info{reinterpret_cast<char const*>(list_bytes), sizeof(list_bytes)});
+    cudf::io::source_info{reinterpret_cast<char const*>(list_bytes.data()), list_bytes.size()});
   auto table = cudf::io::read_parquet(read_opts);
 
   auto const c0 = table.tbl->get_column(0);
@@ -1252,7 +1251,7 @@ TEST_F(ParquetReaderTest, SingleLevelLists)
 
 TEST_F(ParquetReaderTest, ChunkedSingleLevelLists)
 {
-  unsigned char list_bytes[] = {
+  std::array<unsigned char, 214> list_bytes{
     0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06,
     0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03,
     0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15,
@@ -1271,7 +1270,7 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists)
   auto reader = cudf::io::chunked_parquet_reader(
     1L << 31,
     cudf::io::parquet_reader_options::builder(
-      cudf::io::source_info{reinterpret_cast<char const*>(list_bytes), sizeof(list_bytes)}));
+      cudf::io::source_info{reinterpret_cast<char const*>(list_bytes.data()), list_bytes.size()}));
   int iterations = 0;
   while (reader.has_next() && iterations < 10) {
     auto chunk = reader.read_chunk();
@@ -1932,7 +1931,7 @@ TEST_F(ParquetReaderTest, FilterFloatNAN)
 
 TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
 {
-  constexpr unsigned char repeated_bytes[] = {
+  constexpr std::array<unsigned char, 662> repeated_bytes{
     0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12,
     0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00,
     0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a,
@@ -1976,9 +1975,9 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
     0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01,
     0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
 
-  auto read_opts = cudf::io::parquet_reader_options::builder(
-    cudf::io::source_info{reinterpret_cast<char const*>(repeated_bytes), sizeof(repeated_bytes)});
-  auto result = cudf::io::read_parquet(read_opts);
+  auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{
+    reinterpret_cast<char const*>(repeated_bytes.data()), repeated_bytes.size()});
+  auto result    = cudf::io::read_parquet(read_opts);
 
   EXPECT_EQ(result.tbl->view().column(0).size(), 6);
   EXPECT_EQ(result.tbl->view().num_columns(), 2);
diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp
index 9e66fc9409f..7c305235ea6 100644
--- a/cpp/tests/io/parquet_v2_test.cpp
+++ b/cpp/tests/io/parquet_v2_test.cpp
@@ -23,6 +23,8 @@
 
 #include <cudf/io/parquet.hpp>
 
+#include <array>
+
 using cudf::test::iterators::no_nulls;
 
 // Base test fixture for V2 header tests
@@ -693,9 +695,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%012d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%012d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col0          = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows);
 
@@ -715,9 +717,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
 
   // mixed length strings
   auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col7          = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows);
 
@@ -787,9 +789,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%012d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%012d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col0          = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows);
 
@@ -819,9 +821,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
 
   // mixed length strings
   auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids);
 
@@ -897,9 +899,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
 
   // fixed length strings
   auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%012d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%012d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col0          = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows);
 
@@ -914,9 +916,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
 
   // mixed length strings
   auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    char buf[30];
-    sprintf(buf, "%d", i);
-    return std::string(buf);
+    std::array<char, 30> buf;
+    snprintf(buf.data(), buf.size(), "%d", i);  // bounded write, always NUL-terminated
+    return std::string(buf.data());
   });
   auto col3          = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows);
 
@@ -1034,7 +1036,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
 
   // hard coded schema indices.
   // TODO find a way to do this without magic
-  size_t const colidxs[] = {1, 3, 4, 5, 8};
+  constexpr std::array<size_t, 5> colidxs{1, 3, 4, 5, 8};
   for (size_t r = 0; r < fmd.row_groups.size(); r++) {
     auto const& rg = fmd.row_groups[r];
     for (size_t c = 0; c < rg.columns.size(); c++) {
@@ -1129,7 +1131,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
   // col1 will have num_ordered_rows / 2 nulls total
   // col2 will have num_ordered_rows / 3 nulls total
   // col3 will have num_ordered_rows / 4 nulls total
-  int const null_mods[] = {0, 2, 3, 4};
+  constexpr std::array<int, 4> null_mods{0, 2, 3, 4};
 
   for (auto const& rg : fmd.row_groups) {
     for (size_t c = 0; c < rg.columns.size(); c++) {
@@ -1299,7 +1301,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)
 
   table_view expected({col0, col1, col2, col3, col4, col5, col6, col7});
 
-  int64_t const expected_null_counts[]            = {4, 4, 4, 6, 4, 6, 4, 5, 11};
+  constexpr std::array<int64_t, 9> expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11};
   std::vector<int64_t> const expected_def_hists[] = {{1, 1, 2, 3},
                                                      {1, 3, 10},
                                                      {1, 1, 2, 10},
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index c8100038942..8794f2ee304 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -31,6 +31,7 @@
 #include <src/io/parquet/parquet.hpp>
 #include <src/io/parquet/parquet_common.hpp>
 
+#include <array>
 #include <fstream>
 
 using cudf::test::iterators::no_nulls;
@@ -879,53 +880,52 @@ TEST_F(ParquetWriterTest, Decimal128Stats)
 
 TEST_F(ParquetWriterTest, CheckColumnIndexTruncation)
 {
-  char const* coldata[] = {
-    // in-range 7 bit.  should truncate to "yyyyyyyz"
-    "yyyyyyyyy",
-    // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's
-    // considered binary, not UTF-8.  If UTF-8 it should not truncate.
-    "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f",
-    // max binary.  this should not truncate
-    "\xff\xff\xff\xff\xff\xff\xff\xff\xff",
-    // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê"
-    "ééééé",
-    // max 2-byte UTF8 (U+07FF). should not truncate
-    "߿߿߿߿߿",
-    // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ"
-    "ࠀࠀࠀ",
-    // max 3-byte UTF8 (U+FFFF). should not truncate
-    "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf",
-    // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁"
-    "𐀀𐀀𐀀",
-    // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80,
-    // which is no longer valid unicode, but is still ok UTF-8???
-    "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf",
-    // max 4-byte UTF8 (U+1FFFFF). should not truncate
-    "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"};
+  std::array coldata{// in-range 7 bit.  should truncate to "yyyyyyyz"
+                     "yyyyyyyyy",
+                     // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's
+                     // considered binary, not UTF-8.  If UTF-8 it should not truncate.
+                     "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f",
+                     // max binary.  this should not truncate
+                     "\xff\xff\xff\xff\xff\xff\xff\xff\xff",
+                     // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê"
+                     "ééééé",
+                     // max 2-byte UTF8 (U+07FF). should not truncate
+                     "߿߿߿߿߿",
+                     // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ"
+                     "ࠀࠀࠀ",
+                     // max 3-byte UTF8 (U+FFFF). should not truncate
+                     "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf",
+                     // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁"
+                     "𐀀𐀀𐀀",
+                     // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80,
+                     // which is no longer valid unicode, but is still ok UTF-8???
+                     "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf",
+                     // max 4-byte UTF8 (U+1FFFFF). should not truncate
+                     "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"};
 
   // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger
   // than that will not become minimum value (when written as UTF-8).
-  char const* truncated_min[] = {"yyyyyyyy",
-                                 "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f",
-                                 "\xf7\xbf\xbf\xbf",
-                                 "éééé",
-                                 "߿߿߿߿",
-                                 "ࠀࠀ",
-                                 "\xef\xbf\xbf\xef\xbf\xbf",
-                                 "𐀀𐀀",
-                                 "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf",
-                                 "\xf7\xbf\xbf\xbf"};
-
-  char const* truncated_max[] = {"yyyyyyyz",
-                                 "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80",
-                                 "\xff\xff\xff\xff\xff\xff\xff\xff\xff",
-                                 "éééê",
-                                 "߿߿߿߿߿",
-                                 "ࠀࠁ",
-                                 "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf",
-                                 "𐀀𐀁",
-                                 "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80",
-                                 "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"};
+  std::array truncated_min{"yyyyyyyy",
+                           "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f",
+                           "\xf7\xbf\xbf\xbf",
+                           "éééé",
+                           "߿߿߿߿",
+                           "ࠀࠀ",
+                           "\xef\xbf\xbf\xef\xbf\xbf",
+                           "𐀀𐀀",
+                           "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf",
+                           "\xf7\xbf\xbf\xbf"};
+
+  std::array truncated_max{"yyyyyyyz",
+                           "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80",
+                           "\xff\xff\xff\xff\xff\xff\xff\xff\xff",
+                           "éééê",
+                           "߿߿߿߿߿",
+                           "ࠀࠁ",
+                           "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf",
+                           "𐀀𐀁",
+                           "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80",
+                           "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"};
 
   auto cols = [&]() {
     using string_wrapper = column_wrapper<cudf::string_view>;
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index a9186874e83..42a574ac5c0 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -652,7 +652,7 @@ TEST_F(JsonPathTests, MixedOutput)
   // various queries on:
   // clang-format off
   std::vector<std::string> input_strings {
-    "{\"a\": {\"b\" : \"c\"}}",
+    R"({"a": {"b" : "c"}})",
 
     "{"
       "\"a\": {\"b\" : \"c\"},"
@@ -827,7 +827,7 @@ TEST_F(JsonPathTests, AllowSingleQuotes)
   // various queries on:
   std::vector<std::string> input_strings{
     // clang-format off
-    "{\'a\': {\'b\' : \'c\'}}",
+    R"({'a': {'b' : 'c'}})",
 
     "{"
       "\'a\': {\'b\' : \"c\"},"
@@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars)
   {
     std::vector<std::string> input_strings{
       // clang-format off
-      "{\"item\" : [{\"key\" : \"value[\"}]}",
+      R"({"item" : [{"key" : "value["}]})",
       // clang-format on
     };
 
@@ -927,7 +927,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars)
   {
     std::vector<std::string> input_strings{
       // clang-format off
-      "{\"a\" : \"[}{}][][{[\\\"}}[\\\"]\"}",
+      R"({"a" : "[}{}][][{[\"}}[\"]"})",
       // clang-format on
     };
 
@@ -958,8 +958,8 @@ TEST_F(JsonPathTests, EscapeSequences)
 
   std::vector<std::string> input_strings{
     // clang-format off
-    "{\"a\" : \"\\\" \\\\ \\/ \\b \\f \\n \\r \\t\"}",
-    "{\"a\" : \"\\u1248 \\uacdf \\uACDF \\u10EF\"}"
+    R"({"a" : "\" \\ \/ \b \f \n \r \t"})",
+    R"({"a" : "\u1248 \uacdf \uACDF \u10EF"})"
     // clang-format on
   };
 
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 949ffcc26a6..1e9e13ded93 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -35,7 +35,6 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-#include <iostream>
 #include <vector>
 
 using aggregation        = cudf::aggregation;
@@ -1254,7 +1253,7 @@ struct StringReductionTest : public cudf::test::BaseFixture,
 };
 
 // ------------------------------------------------------------------------
-std::vector<std::string> string_list[] = {
+std::vector<std::vector<std::string>> string_list{{
   {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"},
   {"", "two", "three", "four", "five", "six", "seven", "eight", "nine"},
   {"one", "", "three", "four", "five", "six", "seven", "eight", "nine"},
@@ -1264,7 +1263,7 @@ std::vector<std::string> string_list[] = {
   {"\xF7\xBF\xBF\xBF", "", "", "", "", "", "", "", ""},
   {"one", "two", "three", "four", "\xF7\xBF\xBF\xBF", "six", "seven", "eight", "nine"},
   {"one", "two", "\xF7\xBF\xBF\xBF", "four", "five", "six", "seven", "eight", "nine"},
-};
+}};
 INSTANTIATE_TEST_CASE_P(string_cases, StringReductionTest, testing::ValuesIn(string_list));
 TEST_P(StringReductionTest, MinMax)
 {
@@ -2235,7 +2234,7 @@ TYPED_TEST(ReductionTest, NthElement)
 
 struct DictionaryStringReductionTest : public StringReductionTest {};
 
-std::vector<std::string> data_list[] = {
+std::vector<std::vector<std::string>> data_list = {
   {"nine", "two", "five", "three", "five", "six", "two", "eight", "nine"},
 };
 INSTANTIATE_TEST_CASE_P(dictionary_cases,
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index 76dbbaef491..c4463d68a68 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -415,8 +415,8 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
   int row_count = 512;
 
   auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) {
-    char const s[] = {static_cast<char>('a' + (idx % 26)), 0};
-    return std::string(s);
+    char const s = static_cast<char>('a' + (idx % 26));
+    return std::string(1, s);  // parens: (count, ch); braces would build {'\x01', s}
   });
   auto validity   = cudf::detail::make_counting_transform_iterator(
     0, [](auto idx) -> bool { return (idx % 23) != 22; });
diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp
index 9cc8b6dec81..2444992e68f 100644
--- a/cpp/tests/rolling/nth_element_test.cpp
+++ b/cpp/tests/rolling/nth_element_test.cpp
@@ -83,7 +83,7 @@ class rolling_exec {
     return *this;
   }
 
-  std::unique_ptr<cudf::column> test_grouped_nth_element(
+  [[nodiscard]] std::unique_ptr<cudf::column> test_grouped_nth_element(
     cudf::size_type n, std::optional<cudf::null_policy> null_handling = std::nullopt) const
   {
     return cudf::grouped_rolling_window(
@@ -96,7 +96,7 @@ class rolling_exec {
         n, null_handling.value_or(_null_handling)));
   }
 
-  std::unique_ptr<cudf::column> test_nth_element(
+  [[nodiscard]] std::unique_ptr<cudf::column> test_nth_element(
     cudf::size_type n, std::optional<cudf::null_policy> null_handling = std::nullopt) const
   {
     return cudf::rolling_window(_input,
diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp
index 9187672221c..cf81dc6fb42 100644
--- a/cpp/tests/streams/transform_test.cpp
+++ b/cpp/tests/streams/transform_test.cpp
@@ -32,7 +32,7 @@
 class TransformTest : public cudf::test::BaseFixture {};
 
 template <class dtype, class Data>
-void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx)
+void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx)
 {
   auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
   auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init);
diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp
index 7e530b2a34d..5923f8dee5a 100644
--- a/cpp/tests/strings/chars_types_tests.cpp
+++ b/cpp/tests/strings/chars_types_tests.cpp
@@ -24,6 +24,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
+#include <array>
 #include <vector>
 
 struct StringsCharsTest : public cudf::test::BaseFixture {};
@@ -50,20 +51,20 @@ TEST_P(CharsTypes, AllTypes)
                                      "de",
                                      "\t\r\n\f "};
 
-  bool expecteds[] = {false, false, false, false, false, false, false, false,
-                      false, false, false, false, false, true,  false, false,  // decimal
-                      false, false, false, false, false, false, false, false,
-                      false, true,  false, true,  false, true,  false, false,  // numeric
-                      false, false, false, false, false, false, false, false,
-                      false, false, false, true,  false, true,  false, false,  // digit
-                      true,  true,  false, true,  false, false, false, false,
-                      false, false, false, false, false, false, true,  false,  // alpha
-                      false, false, false, false, false, false, false, false,
-                      false, false, false, false, false, false, false, true,  // space
-                      false, false, false, true,  false, false, false, false,
-                      false, false, false, false, false, false, false, false,  // upper
-                      false, true,  false, false, false, false, false, false,
-                      false, false, false, false, false, false, true,  false};  // lower
+  std::array expecteds{false, false, false, false, false, false, false, false,
+                       false, false, false, false, false, true,  false, false,  // decimal
+                       false, false, false, false, false, false, false, false,
+                       false, true,  false, true,  false, true,  false, false,  // numeric
+                       false, false, false, false, false, false, false, false,
+                       false, false, false, true,  false, true,  false, false,  // digit
+                       true,  true,  false, true,  false, false, false, false,
+                       false, false, false, false, false, false, true,  false,  // alpha
+                       false, false, false, false, false, false, false, false,
+                       false, false, false, false, false, false, false, true,  // space
+                       false, false, false, true,  false, false, false, false,
+                       false, false, false, false, false, false, false, false,  // upper
+                       false, true,  false, false, false, false, false, false,
+                       false, false, false, false, false, false, true,  false};  // lower
 
   auto is_parm = GetParam();
 
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index acf850c7a66..bdfd38267e6 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -32,6 +32,7 @@
 #include <thrust/iterator/transform_iterator.h>
 
 #include <algorithm>
+#include <array>
 #include <vector>
 
 struct StringsContainsTests : public cudf::test::BaseFixture {};
@@ -167,10 +168,8 @@ TEST_F(StringsContainsTests, MatchesTest)
   auto strings_view = cudf::strings_column_view(strings);
   {
     auto const pattern = std::string("lazy");
-    bool h_expected[]  = {false, false, true, false, false, false, false};
     cudf::test::fixed_width_column_wrapper<bool> expected(
-      h_expected,
-      h_expected + h_strings.size(),
+      {false, false, true, false, false, false, false},
       thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
     auto prog    = cudf::strings::regex_program::create(pattern);
     auto results = cudf::strings::matches_re(strings_view, *prog);
@@ -178,10 +177,8 @@ TEST_F(StringsContainsTests, MatchesTest)
   }
   {
     auto const pattern = std::string("\\d+");
-    bool h_expected[]  = {false, false, false, true, true, false, false};
     cudf::test::fixed_width_column_wrapper<bool> expected(
-      h_expected,
-      h_expected + h_strings.size(),
+      {false, false, false, true, true, false, false},
       thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
     auto prog    = cudf::strings::regex_program::create(pattern);
     auto results = cudf::strings::matches_re(strings_view, *prog);
@@ -189,10 +186,8 @@ TEST_F(StringsContainsTests, MatchesTest)
   }
   {
     auto const pattern = std::string("@\\w+");
-    bool h_expected[]  = {false, false, false, false, false, false, false};
     cudf::test::fixed_width_column_wrapper<bool> expected(
-      h_expected,
-      h_expected + h_strings.size(),
+      {false, false, false, false, false, false, false},
       thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
     auto prog    = cudf::strings::regex_program::create(pattern);
     auto results = cudf::strings::matches_re(strings_view, *prog);
@@ -200,10 +195,8 @@ TEST_F(StringsContainsTests, MatchesTest)
   }
   {
     auto const pattern = std::string(".*");
-    bool h_expected[]  = {true, true, true, true, true, false, true};
     cudf::test::fixed_width_column_wrapper<bool> expected(
-      h_expected,
-      h_expected + h_strings.size(),
+      {true, true, true, true, true, false, true},
       thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
     auto prog    = cudf::strings::regex_program::create(pattern);
     auto results = cudf::strings::matches_re(strings_view, *prog);
@@ -335,9 +328,9 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter)
 {
   std::vector<std::string> data(10);
   std::generate(data.begin(), data.end(), [n = 0]() mutable {
-    char first      = static_cast<char>('A' + n++);
-    char raw_data[] = {first, '\0', 'B'};
-    return std::string{raw_data, 3};
+    char first          = static_cast<char>('A' + n++);
+    std::array raw_data = {first, '\0', 'B'};
+    return std::string{raw_data.data(), 3};
   });
   cudf::test::strings_column_wrapper input(data.begin(), data.end());
   auto strings_view = cudf::strings_column_view(input);
@@ -749,11 +742,11 @@ TEST_F(StringsContainsTests, ASCII)
   auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé 　❽❽", "aZ ❽4", "XYZ　8"});
   auto view = cudf::strings_column_view(input);
 
-  std::string patterns[] = {R"(\w+[\s]+\d+)",
-                            R"([^\W]+\s+[^\D]+)",
-                            R"([\w]+[^\S]+[\d]+)",
-                            R"([\w]+\s+[\d]+)",
-                            R"(\w+\s+\d+)"};
+  std::array patterns = {R"(\w+[\s]+\d+)",
+                         R"([^\W]+\s+[^\D]+)",
+                         R"([\w]+[^\S]+[\d]+)",
+                         R"([\w]+\s+[\d]+)",
+                         R"(\w+\s+\d+)"};
 
   for (auto ptn : patterns) {
     auto expected_contains = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0, 0});
@@ -787,24 +780,18 @@ TEST_F(StringsContainsTests, MediumRegex)
 
   auto strings_view = cudf::strings_column_view(strings);
   {
-    auto results      = cudf::strings::contains_re(strings_view, *prog);
-    bool h_expected[] = {true, false, false};
-    cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
-                                                          h_expected + h_strings.size());
+    auto results = cudf::strings::contains_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
   {
-    auto results      = cudf::strings::matches_re(strings_view, *prog);
-    bool h_expected[] = {true, false, false};
-    cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
-                                                          h_expected + h_strings.size());
+    auto results = cudf::strings::matches_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
   {
-    auto results         = cudf::strings::count_re(strings_view, *prog);
-    int32_t h_expected[] = {1, 0, 0};
-    cudf::test::fixed_width_column_wrapper<int32_t> expected(h_expected,
-                                                             h_expected + h_strings.size());
+    auto results = cudf::strings::count_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<int32_t> expected({1, 0, 0});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
 }
@@ -828,24 +815,18 @@ TEST_F(StringsContainsTests, LargeRegex)
 
   auto strings_view = cudf::strings_column_view(strings);
   {
-    auto results      = cudf::strings::contains_re(strings_view, *prog);
-    bool h_expected[] = {true, false, false};
-    cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
-                                                          h_expected + h_strings.size());
+    auto results = cudf::strings::contains_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
   {
-    auto results      = cudf::strings::matches_re(strings_view, *prog);
-    bool h_expected[] = {true, false, false};
-    cudf::test::fixed_width_column_wrapper<bool> expected(h_expected,
-                                                          h_expected + h_strings.size());
+    auto results = cudf::strings::matches_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<bool> expected({true, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
   {
-    auto results         = cudf::strings::count_re(strings_view, *prog);
-    int32_t h_expected[] = {1, 0, 0};
-    cudf::test::fixed_width_column_wrapper<int32_t> expected(h_expected,
-                                                             h_expected + h_strings.size());
+    auto results = cudf::strings::count_re(strings_view, *prog);
+    cudf::test::fixed_width_column_wrapper<int32_t> expected({1, 0, 0});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
 }
diff --git a/cpp/tests/strings/durations_tests.cpp b/cpp/tests/strings/durations_tests.cpp
index 86189b29981..f2e31339035 100644
--- a/cpp/tests/strings/durations_tests.cpp
+++ b/cpp/tests/strings/durations_tests.cpp
@@ -24,6 +24,7 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
+#include <array>
 #include <vector>
 
 struct StringsDurationsTest : public cudf::test::BaseFixture {};
@@ -403,17 +404,17 @@ TEST_F(StringsDurationsTest, ParseSingle)
                                                 "01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
-  int32_t expected_v[]{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0};
-  auto it1 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 3600}; });
+  std::array expected_v{0, 0, 1, -1, 23, -23, 59, -59, 99, -99, 0, 1, 0};
+  auto it1 = thrust::make_transform_iterator(expected_v.data(),
+                                             [](auto i) { return cudf::duration_s{i * 3600}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s1(it1, it1 + size);
   auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                              cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
                                              "%H");
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1);
 
-  auto it2 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i * 60}; });
+  auto it2 = thrust::make_transform_iterator(expected_v.data(),
+                                             [](auto i) { return cudf::duration_s{i * 60}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s2(it2, it2 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                         cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
@@ -421,14 +422,14 @@ TEST_F(StringsDurationsTest, ParseSingle)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s2);
 
   auto it3 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; });
+    thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s3(it3, it3 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                         cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
                                         "%S");
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s3);
 
-  auto it4 = thrust::make_transform_iterator(expected_v,
+  auto it4 = thrust::make_transform_iterator(expected_v.data(),
                                              [](auto i) { return cudf::duration_ms{i * 60000}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_ms> expected_ms(it4, it4 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
@@ -454,21 +455,21 @@ TEST_F(StringsDurationsTest, ParseMultiple)
                                                 "01:01:01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
-  int32_t expected_v[]{0,
-                       0,
-                       -1,
-                       -(3600 + 60 + 1),
-                       23 * 3600 + 1,
-                       -(23 * 3600 + 1),
-                       59 * 3600,
-                       -59 * 3600,
-                       99 * 3600,
-                       -99 * 3600,
-                       0,
-                       3661,
-                       0};
+  std::array expected_v{0,
+                        0,
+                        -1,
+                        -(3600 + 60 + 1),
+                        23 * 3600 + 1,
+                        -(23 * 3600 + 1),
+                        59 * 3600,
+                        -59 * 3600,
+                        99 * 3600,
+                        -99 * 3600,
+                        0,
+                        3661,
+                        0};
   auto it1 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; });
+    thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s1(it1, it1 + size);
   auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                              cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
@@ -476,7 +477,7 @@ TEST_F(StringsDurationsTest, ParseMultiple)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1);
 
   auto it2 = thrust::make_transform_iterator(
-    expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; });
+    expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_D> expected_D2(it2, it2 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                         cudf::data_type(cudf::type_to_id<cudf::duration_D>()),
@@ -508,28 +509,28 @@ TEST_F(StringsDurationsTest, ParseSubsecond)
                                                 "01:01:01",
                                                 ""};  // error
   auto size = cudf::column_view(string_src).size();
-  int64_t expected_v[]{0,
-                       -123456789L,
-                       -1000666999L,
-                       -((3600 + 60 + 1) * 1000000000L + 100000000L),
-                       (23 * 3600 + 1) * 1000000000L + 80L,
-                       -((23 * 3600 + 1) * 1000000000L + 123000000L),
-                       (59 * 3600) * 1000000000L,
-                       -(59 * 3600) * 1000000000L,
-                       (99 * 3600) * 1000000000L,
-                       -(99 * 3600) * 1000000000L,
-                       0,
-                       (3661) * 1000000000L,
-                       0};
+  std::array<int64_t, 13> expected_v{0,
+                                     -123456789L,
+                                     -1000666999L,
+                                     -((3600 + 60 + 1) * 1000000000L + 100000000L),
+                                     (23 * 3600 + 1) * 1000000000L + 80L,
+                                     -((23 * 3600 + 1) * 1000000000L + 123000000L),
+                                     (59 * 3600) * 1000000000L,
+                                     -(59 * 3600) * 1000000000L,
+                                     (99 * 3600) * 1000000000L,
+                                     -(99 * 3600) * 1000000000L,
+                                     0,
+                                     (3661) * 1000000000L,
+                                     0};
   auto it1 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ns{i}; });
+    thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_ns{i}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_ns> expected_ns1(it1, it1 + size);
   auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                              cudf::data_type(cudf::type_to_id<cudf::duration_ns>()),
                                              "%H:%M:%S");
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ns1);
 
-  auto it2 = thrust::make_transform_iterator(expected_v,
+  auto it2 = thrust::make_transform_iterator(expected_v.data(),
                                              [](auto i) { return cudf::duration_ms{i / 1000000}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_ms> expected_ms2(it2, it2 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
@@ -559,25 +560,25 @@ TEST_F(StringsDurationsTest, ParseAMPM)
                                                 "01:01:01",   // error
                                                 ""};          // error
   auto size = cudf::column_view(string_src).size();
-  int32_t expected_v[]{0,
-                       0 + 12 * 3600,
-                       0,
-                       0 - 12 * 3600,
-                       -1,
-                       -1 - 12 * 3600,
-                       -(3600 + 60 + 1),
-                       -(3600 + 60 + 1) - 12 * 3600,
-                       11 * 3600 + 59 * 60 + 59,
-                       11 * 3600 + 59 * 60 + 59 + 12 * 3600,
-                       -(11 * 3600 + 59 * 60 + 59),
-                       -(11 * 3600 + 59 * 60 + 59 + 12 * 3600),
-                       0,
-                       0,
-                       0,
-                       0,
-                       0};
+  std::array expected_v{0,
+                        0 + 12 * 3600,
+                        0,
+                        0 - 12 * 3600,
+                        -1,
+                        -1 - 12 * 3600,
+                        -(3600 + 60 + 1),
+                        -(3600 + 60 + 1) - 12 * 3600,
+                        11 * 3600 + 59 * 60 + 59,
+                        11 * 3600 + 59 * 60 + 59 + 12 * 3600,
+                        -(11 * 3600 + 59 * 60 + 59),
+                        -(11 * 3600 + 59 * 60 + 59 + 12 * 3600),
+                        0,
+                        0,
+                        0,
+                        0,
+                        0};
   auto it1 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; });
+    thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s1(it1, it1 + size);
   auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                              cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
@@ -585,7 +586,7 @@ TEST_F(StringsDurationsTest, ParseAMPM)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1);
 
   auto it2 = thrust::make_transform_iterator(
-    expected_v, [](auto i) { return cudf::duration_D{i / (24 * 3600)}; });
+    expected_v.data(), [](auto i) { return cudf::duration_D{i / (24 * 3600)}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_D> expected_D2(it2, it2 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                         cudf::data_type(cudf::type_to_id<cudf::duration_D>()),
@@ -616,20 +617,20 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier)
                                                 "01:01:01",  // error
                                                 ""};         // error
   auto size = cudf::column_view(string_src).size();
-  int32_t expected_v[]{0,
-                       0 + 12 * 3600,
-                       1,
-                       1 + 12 * 3600,
-                       (3600 + 60 + 1),
-                       (3600 + 60 + 1) + 12 * 3600,
-                       11 * 3600 + 59 * 60 + 59,
-                       11 * 3600 + 59 * 60 + 59 + 12 * 3600,
-                       0,
-                       0,
-                       0,
-                       0};
+  std::array expected_v{0,
+                        0 + 12 * 3600,
+                        1,
+                        1 + 12 * 3600,
+                        (3600 + 60 + 1),
+                        (3600 + 60 + 1) + 12 * 3600,
+                        11 * 3600 + 59 * 60 + 59,
+                        11 * 3600 + 59 * 60 + 59 + 12 * 3600,
+                        0,
+                        0,
+                        0,
+                        0};
   auto it1 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_s{i}; });
+    thrust::make_transform_iterator(expected_v.data(), [](auto i) { return cudf::duration_s{i}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_s> expected_s1(it1, it1 + size);
   auto results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                              cudf::data_type(cudf::type_to_id<cudf::duration_s>()),
@@ -641,8 +642,8 @@ TEST_F(StringsDurationsTest, ParseCompoundSpecifier)
                                         "%OI:%OM:%OS %p");
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_s1);
 
-  auto it2 =
-    thrust::make_transform_iterator(expected_v, [](auto i) { return cudf::duration_ms{i * 1000}; });
+  auto it2 = thrust::make_transform_iterator(expected_v.data(),
+                                             [](auto i) { return cudf::duration_ms{i * 1000}; });
   cudf::test::fixed_width_column_wrapper<cudf::duration_ms> expected_s2(it2, it2 + size);
   results = cudf::strings::to_durations(cudf::strings_column_view(string_src),
                                         cudf::data_type(cudf::type_to_id<cudf::duration_ms>()),
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index 1491da758d5..61246fb098d 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -275,8 +275,8 @@ TEST_F(StringsExtractTests, ExtractAllTest)
 
   auto pattern = std::string("(\\d+) (\\w+)");
 
-  bool valids[] = {true, true, true, false, false, false, true};
-  using LCW     = cudf::test::lists_column_wrapper<cudf::string_view>;
+  std::array valids{true, true, true, false, false, false, true};
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
   LCW expected({LCW{"123", "banana", "7", "eleven"},
                 LCW{"41", "apple"},
                 LCW{"6", "péar", "0", "pair"},
@@ -284,7 +284,7 @@ TEST_F(StringsExtractTests, ExtractAllTest)
                 LCW{},
                 LCW{},
                 LCW{"4", "paré"}},
-               valids);
+               valids.data());
   auto prog    = cudf::strings::regex_program::create(pattern);
   auto results = cudf::strings::extract_all_record(sv, *prog);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 6eea1895fb1..73da4d081e2 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -33,10 +33,10 @@ struct StringsFindallTests : public cudf::test::BaseFixture {};
 
 TEST_F(StringsFindallTests, FindallTest)
 {
-  bool valids[] = {true, true, true, true, true, false, true, true};
+  std::array valids{true, true, true, true, true, false, true, true};
   cudf::test::strings_column_wrapper input(
     {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"},
-    valids);
+    valids.data());
   auto sv = cudf::strings_column_view(input);
 
   auto pattern = std::string("(\\d+)-(\\w+)");
@@ -50,7 +50,7 @@ TEST_F(StringsFindallTests, FindallTest)
                 LCW{},
                 LCW{},
                 LCW{"25-9000"}},
-               valids);
+               valids.data());
   auto prog    = cudf::strings::regex_program::create(pattern);
   auto results = cudf::strings::findall(sv, *prog);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp
index 5fa02d9978a..1785848ec77 100644
--- a/cpp/tests/transform/integration/unary_transform_test.cpp
+++ b/cpp/tests/transform/integration/unary_transform_test.cpp
@@ -30,7 +30,7 @@ namespace transformation {
 struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {};
 
 template <class dtype, class Op, class Data>
-void test_udf(char const udf[], Op op, Data data_init, cudf::size_type size, bool is_ptx)
+void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool is_ptx)
 {
   auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
   auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init);

From 4018d3116b2bfd876253b187894df10cb325db2f Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Fri, 27 Sep 2024 13:17:03 -0400
Subject: [PATCH 10/14] Remove superfluous use of std::vector for std::future
 (#16829)

This PR addresses #16888 by removing a superfluous use of `std::vector` to hold a single `std::future`.

closes #16888

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16829
---
 cpp/src/io/parquet/reader_impl.hpp           |  4 +--
 cpp/src/io/parquet/reader_impl_preprocess.cu | 26 +++++++++-----------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 2d46da14bec..62ffc4d3077 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -188,10 +188,10 @@ class reader::impl {
    *
    * Does not decompress the chunk data.
    *
-   * @return pair of boolean indicating if compressed chunks were found and a vector of futures for
+   * @return pair of boolean indicating if compressed chunks were found and a future for
    * read completion
    */
-  std::pair<bool, std::vector<std::future<void>>> read_column_chunks();
+  std::pair<bool, std::future<void>> read_column_chunks();
 
   /**
    * @brief Read compressed data and page information for the current pass.
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 8e67f233213..3763c2e8e6d 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -964,7 +964,7 @@ void reader::impl::allocate_level_decode_space()
   }
 }
 
-std::pair<bool, std::vector<std::future<void>>> reader::impl::read_column_chunks()
+std::pair<bool, std::future<void>> reader::impl::read_column_chunks()
 {
   auto const& row_groups_info = _pass_itm_data->row_groups;
 
@@ -989,7 +989,6 @@ std::pair<bool, std::vector<std::future<void>>> reader::impl::read_column_chunks
   // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide
   // skip_rows/num_rows
   // auto remaining_rows            = num_rows;
-  std::vector<std::future<void>> read_chunk_tasks;
   size_type chunk_count = 0;
   for (auto const& rg : row_groups_info) {
     auto const& row_group       = _metadata->get_row_group(rg.index, rg.source_index);
@@ -1018,16 +1017,15 @@ std::pair<bool, std::vector<std::future<void>>> reader::impl::read_column_chunks
   }
 
   // Read compressed chunk data to device memory
-  read_chunk_tasks.push_back(read_column_chunks_async(_sources,
-                                                      raw_page_data,
-                                                      chunks,
-                                                      0,
-                                                      chunks.size(),
-                                                      column_chunk_offsets,
-                                                      chunk_source_map,
-                                                      _stream));
-
-  return {total_decompressed_size > 0, std::move(read_chunk_tasks)};
+  return {total_decompressed_size > 0,
+          read_column_chunks_async(_sources,
+                                   raw_page_data,
+                                   chunks,
+                                   0,
+                                   chunks.size(),
+                                   column_chunk_offsets,
+                                   chunk_source_map,
+                                   _stream)};
 }
 
 void reader::impl::read_compressed_data()
@@ -1042,9 +1040,7 @@ void reader::impl::read_compressed_data()
   auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks();
   pass.has_compressed_data                            = has_compressed_data;
 
-  for (auto& task : read_chunks_tasks) {
-    task.wait();
-  }
+  read_chunks_tasks.wait();
 
   // Process dataset chunk pages into output columns
   auto const total_pages = _has_page_index ? count_page_headers_with_pgidx(chunks, _stream)

From afe9f929abf565c235d5a4e375ef33f2cf032487 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 27 Sep 2024 10:55:48 -0700
Subject: [PATCH 11/14] clang-tidy fixes part 2 (#16938)

Subset of improvements to the code base proposed by the latest version of clang-tidy.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16938
---
 cpp/src/datetime/timezone.cpp                 |  2 +-
 cpp/src/dictionary/dictionary_column_view.cpp |  5 ++---
 cpp/src/interop/dlpack.cpp                    |  4 ++--
 cpp/src/io/avro/avro.cpp                      |  2 +-
 cpp/src/io/avro/avro.hpp                      | 21 ++++++++++---------
 cpp/src/io/comp/uncomp.cpp                    |  4 ++--
 cpp/src/io/parquet/parquet_gpu.hpp            | 21 ++++++++++---------
 cpp/src/jit/parser.cpp                        |  4 +---
 cpp/src/strings/regex/regcomp.cpp             |  4 ++--
 cpp/src/utilities/stream_pool.cpp             |  2 +-
 10 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index cf239297255..a6b6cbbf0b5 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -38,7 +38,7 @@ std::string const tzif_system_directory = "/usr/share/zoneinfo/";
 struct timezone_file_header {
   uint32_t magic;          ///< "TZif"
   uint8_t version;         ///< 0:version1, '2':version2, '3':version3
-  uint8_t reserved15[15];  ///< unused, reserved for future use
+  uint8_t reserved15[15];  ///< unused, reserved for future use // NOLINT
   uint32_t isutccnt;       ///< number of UTC/local indicators contained in the body
   uint32_t isstdcnt;       ///< number of standard/wall indicators contained in the body
   uint32_t leapcnt;        ///< number of leap second records contained in the body
diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp
index 4906e5b4f9c..3e4a201bba4 100644
--- a/cpp/src/dictionary/dictionary_column_view.cpp
+++ b/cpp/src/dictionary/dictionary_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,8 +36,7 @@ column_view dictionary_column_view::indices() const noexcept { return child(0);
 
 column_view dictionary_column_view::get_indices_annotated() const noexcept
 {
-  return column_view(
-    indices().type(), size(), indices().head(), null_mask(), null_count(), offset());
+  return {indices().type(), size(), indices().head(), null_mask(), null_count(), offset()};
 }
 
 column_view dictionary_column_view::keys() const noexcept { return child(1); }
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index ba5b11b90d8..a1be6aade4e 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -118,8 +118,8 @@ DLDataType data_type_to_DLDataType(data_type type)
 
 // Context object to own memory allocated for DLManagedTensor
 struct dltensor_context {
-  int64_t shape[2];
-  int64_t strides[2];
+  int64_t shape[2];    // NOLINT
+  int64_t strides[2];  // NOLINT
   rmm::device_buffer buffer;
 
   static void deleter(DLManagedTensor* arg)
diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp
index 2041f03cd81..03cf6d4a0e0 100644
--- a/cpp/src/io/avro/avro.cpp
+++ b/cpp/src/io/avro/avro.cpp
@@ -199,7 +199,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row)
     // Read the next sync markers and ensure they match the first ones we
     // encountered.  If they don't, we have to assume the data is corrupted,
     // and thus, we terminate processing immediately.
-    uint64_t const sync_marker[] = {get_raw<uint64_t>(), get_raw<uint64_t>()};
+    std::array const sync_marker = {get_raw<uint64_t>(), get_raw<uint64_t>()};
     bool valid_sync_markers =
       ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1]));
     if (!valid_sync_markers) { return false; }
diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp
index f2813a1ba51..2e992546ccc 100644
--- a/cpp/src/io/avro/avro.hpp
+++ b/cpp/src/io/avro/avro.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include "avro_common.hpp"
 
 #include <algorithm>
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
@@ -100,15 +101,15 @@ struct column_desc {
  */
 struct file_metadata {
   std::map<std::string, std::string> user_data;
-  std::string codec         = "";
-  uint64_t sync_marker[2]   = {0, 0};
-  size_t metadata_size      = 0;
-  size_t total_data_size    = 0;
-  size_t selected_data_size = 0;
-  size_type num_rows        = 0;
-  size_type skip_rows       = 0;
-  size_type total_num_rows  = 0;
-  uint32_t max_block_size   = 0;
+  std::string codec                   = "";
+  std::array<uint64_t, 2> sync_marker = {0, 0};
+  size_t metadata_size                = 0;
+  size_t total_data_size              = 0;
+  size_t selected_data_size           = 0;
+  size_type num_rows                  = 0;
+  size_type skip_rows                 = 0;
+  size_type total_num_rows            = 0;
+  uint32_t max_block_size             = 0;
   std::vector<schema_entry> schema;
   std::vector<block_desc_s> block_list;
   std::vector<column_desc> columns;
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 602ff1734b6..1af45b41d8e 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -42,7 +42,7 @@ struct gz_file_header_s {
   uint8_t id2;        // 0x8b
   uint8_t comp_mthd;  // compression method (0-7=reserved, 8=deflate)
   uint8_t flags;      // flags (GZIPHeaderFlag)
-  uint8_t mtime[4];   // If non-zero: modification time (Unix format)
+  uint8_t mtime[4];   // If non-zero: modification time (Unix format)  // NOLINT
   uint8_t xflags;     // Extra compressor-specific flags
   uint8_t os;         // OS id
 };
@@ -103,7 +103,7 @@ struct zip_lfh_s {
 };
 
 struct bz2_file_header_s {
-  uint8_t sig[3];  // "BZh"
+  uint8_t sig[3];  // "BZh" // NOLINT
   uint8_t blksz;   // block size 1..9 in 100kB units (post-RLE)
 };
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 1390339c1ae..e631e12119d 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -294,7 +294,8 @@ struct PageInfo {
   int32_t uncompressed_page_size;  // uncompressed data size in bytes
   // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length
   // indicator. instead the lengths for these are stored in the header.
-  int32_t lvl_bytes[level_type::NUM_LEVEL_TYPES];  // length of the rep/def levels (V2 header)
+  int32_t                                    // NOLINT
+    lvl_bytes[level_type::NUM_LEVEL_TYPES];  // length of the rep/def levels (V2 header)
   // Number of values in this data page or dictionary.
   // Important : the # of input values does not necessarily
   // correspond to the number of rows in the output. It just reflects the number
@@ -345,7 +346,7 @@ struct PageInfo {
   PageNestingDecodeInfo* nesting_decode;
 
   // level decode buffers
-  uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES];
+  uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES];  // NOLINT
 
   // temporary space for decoding DELTA_BYTE_ARRAY encoded strings
   int64_t temp_string_size;
@@ -431,14 +432,14 @@ struct ColumnChunkDesc {
   size_t num_values{};               // total number of values in this column
   size_t start_row{};                // file-wide, absolute starting row of this chunk
   uint32_t num_rows{};               // number of rows in this chunk
-  int16_t max_level[level_type::NUM_LEVEL_TYPES]{};  // max definition/repetition level
-  int16_t max_nesting_depth{};                       // max nesting depth of the output
-  int32_t type_length{};                             // type length from schema (for FLBA only)
-  Type physical_type{};                              // parquet physical data type
-  uint8_t
-    level_bits[level_type::NUM_LEVEL_TYPES]{};  // bits to encode max definition/repetition levels
-  int32_t num_data_pages{};                     // number of data pages
-  int32_t num_dict_pages{};                     // number of dictionary pages
+  int16_t max_level[level_type::NUM_LEVEL_TYPES]{};   // max definition/repetition level  // NOLINT
+  int16_t max_nesting_depth{};                        // max nesting depth of the output
+  int32_t type_length{};                              // type length from schema (for FLBA only)
+  Type physical_type{};                               // parquet physical data type
+  uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{};  // bits to encode max   // NOLINT
+                                                      // definition/repetition levels
+  int32_t num_data_pages{};                           // number of data pages
+  int32_t num_dict_pages{};                           // number of dictionary pages
   PageInfo const* dict_page{};
   string_index_pair* str_dict_index{};  // index for string dictionary
   bitmask_type** valid_map_base{};      // base pointers of valid bit map for this column
diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp
index 398c36821cc..519ac2d1a2e 100644
--- a/cpp/src/jit/parser.cpp
+++ b/cpp/src/jit/parser.cpp
@@ -19,8 +19,6 @@
 #include <cudf/utilities/error.hpp>
 
 #include <algorithm>
-#include <cctype>
-#include <map>
 #include <set>
 #include <string>
 #include <utility>
@@ -28,7 +26,7 @@
 
 namespace cudf {
 namespace jit {
-constexpr char percent_escape[] = "_";
+constexpr char percent_escape[] = "_";  // NOLINT
 
 inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
 
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 7c4c89bd3fb..51c6e765edd 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -35,7 +35,7 @@ namespace strings {
 namespace detail {
 namespace {
 // Bitmask of all operators
-#define OPERATOR_MASK 0200
+enum { OPERATOR_MASK = 0200 };
 enum OperatorType : int32_t {
   START        = 0200,  // Start, used for marker on stack
   LBRA_NC      = 0203,  // non-capturing group
@@ -50,7 +50,7 @@ enum OperatorType : int32_t {
   COUNTED_LAZY = 0215,
   NOP          = 0302,  // No operation, internal use only
 };
-#define ITEM_MASK 0300
+enum { ITEM_MASK = 0300 };
 
 static reclass cclass_w(CCLASS_W);   // \w
 static reclass cclass_s(CCLASS_S);   // \s
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 9824c472b20..8c29182bfb5 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -82,7 +82,7 @@ class rmm_cuda_stream_pool : public cuda_stream_pool {
     return streams;
   }
 
-  std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; }
+  [[nodiscard]] std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; }
 };
 
 /**

From 670cc3f9c6add1fddde142ec3dece65643d3f022 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 27 Sep 2024 08:43:53 -1000
Subject: [PATCH 12/14] Avoid public constructors when called with columns to
 avoid unnecessary validation (#16747)

This PR continues the effort to avoid public constructors when passing columns, so that unnecessary validation is skipped.

Maybe we should consider disallowing public constructors from accepting columns altogether, but I suspect some RAPIDS libraries are passing columns to public constructors.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16747
---
 python/cudf/cudf/core/column/categorical.py |  2 +-
 python/cudf/cudf/core/dataframe.py          | 44 +++++++++------------
 python/cudf/cudf/core/multiindex.py         | 26 +++++-------
 python/cudf/cudf/core/reshape.py            | 29 +++++++-------
 python/cudf/cudf/core/window/ewm.py         | 33 ++++++++--------
 python/cudf/cudf/core/window/rolling.py     | 39 +++++++++---------
 6 files changed, 77 insertions(+), 96 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index de5ed15771d..864e87b5377 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1337,7 +1337,7 @@ def _set_categories(
 
         # Ensure new_categories is unique first
         if not (is_unique or new_cats.is_unique):
-            new_cats = cudf.Series(new_cats)._column.unique()
+            new_cats = new_cats.unique()
 
         if cur_cats.equals(new_cats, check_dtypes=True):
             # TODO: Internal usages don't always need a copy; add a copy keyword
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 16b0aa95c35..79ed5a0e187 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6287,14 +6287,17 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
             )
 
         if not skipna and any(col.nullable for col in filtered._columns):
-            mask = DataFrame(
+            length = filtered._data.nrows
+            ca = ColumnAccessor(
                 {
-                    name: filtered._data[name]._get_mask_as_column()
-                    if filtered._data[name].nullable
-                    else as_column(True, length=len(filtered._data[name]))
-                    for name in filtered._column_names
-                }
+                    name: col._get_mask_as_column()
+                    if col.nullable
+                    else as_column(True, length=length)
+                    for name, col in filtered._data.items()
+                },
+                verify=False,
             )
+            mask = DataFrame._from_data(ca)
             mask = mask.all(axis=1)
         else:
             mask = None
@@ -6679,19 +6682,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
                 )
             return Series._from_column(result, index=self.index)
         else:
-            result_df = DataFrame(result).set_index(self.index)
+            result_df = DataFrame(result, index=self.index)
             result_df._set_columns_like(prepared._data)
             return result_df
 
-    @_performance_tracking
-    def _columns_view(self, columns):
-        """
-        Return a subset of the DataFrame's columns as a view.
-        """
-        return DataFrame(
-            {col: self._data[col] for col in columns}, index=self.index
-        )
-
     @_performance_tracking
     def select_dtypes(self, include=None, exclude=None):
         """Return a subset of the DataFrame's columns based on the column dtypes.
@@ -6763,8 +6757,6 @@ def select_dtypes(self, include=None, exclude=None):
         if not isinstance(exclude, (list, tuple)):
             exclude = (exclude,) if exclude is not None else ()
 
-        df = DataFrame(index=self.index)
-
         # cudf_dtype_from_pydata_dtype can distinguish between
         # np.float and np.number
         selection = tuple(map(frozenset, (include, exclude)))
@@ -6820,12 +6812,12 @@ def select_dtypes(self, include=None, exclude=None):
         # remove all exclude types
         inclusion = inclusion - exclude_subtypes
 
-        for k, col in self._column_labels_and_values:
-            infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
-            if infered_type in inclusion:
-                df._insert(len(df._data), k, col)
-
-        return df
+        to_select = [
+            label
+            for label, dtype in self._dtypes
+            if cudf_dtype_from_pydata_dtype(dtype) in inclusion
+        ]
+        return self.loc[:, to_select]
 
     @ioutils.doc_to_parquet()
     def to_parquet(
@@ -7331,7 +7323,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
 
         cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
         cols = self._data.to_pandas_index()
-        df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
+        df = DataFrame(cupy.asfortranarray(cov), index=cols)
         df._set_columns_like(self._data)
         return df
 
@@ -7374,7 +7366,7 @@ def corr(
 
         corr = cupy.corrcoef(values, rowvar=False)
         cols = self._data.to_pandas_index()
-        df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
+        df = DataFrame(cupy.asfortranarray(corr), index=cols)
         df._set_columns_like(self._data)
         return df
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 6de3981ba66..92d094d9de5 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -700,7 +700,10 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
             lookup_dict[i] = row
         lookup = cudf.DataFrame(lookup_dict)
         frame = cudf.DataFrame._from_data(
-            ColumnAccessor(dict(enumerate(index._columns)), verify=False)
+            ColumnAccessor(
+                dict(enumerate(index._columns)),
+                verify=False,
+            )
         )
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", FutureWarning)
@@ -780,18 +783,12 @@ def _index_and_downcast(self, result, index, index_key):
             index_key = index_key[0]
 
         slice_access = isinstance(index_key, slice)
-        out_index = cudf.DataFrame()
-        # Select the last n-k columns where n is the number of columns and k is
+        # Count the last n-k columns where n is the number of columns and k is
         # the length of the indexing tuple
         size = 0
         if not isinstance(index_key, (numbers.Number, slice)):
             size = len(index_key)
-        for k in range(size, len(index._data)):
-            out_index.insert(
-                out_index._num_columns,
-                k,
-                cudf.Series._from_column(index._columns[k]),
-            )
+        num_selected = max(0, index.nlevels - size)
 
         # determine if we should downcast from a DataFrame to a Series
         need_downcast = (
@@ -814,16 +811,13 @@ def _index_and_downcast(self, result, index, index_key):
             result = cudf.Series._from_data(
                 {}, name=tuple(col[0] for col in index._columns)
             )
-        elif out_index._num_columns == 1:
+        elif num_selected == 1:
             # If there's only one column remaining in the output index, convert
             # it into an Index and name the final index values according
             # to that column's name.
-            last_column = index._columns[-1]
-            out_index = cudf.Index._from_column(
-                last_column, name=index.names[-1]
-            )
-            index = out_index
-        elif out_index._num_columns > 1:
+            *_, last_column = index._data.columns
+            index = cudf.Index._from_column(last_column, name=index.names[-1])
+        elif num_selected > 1:
             # Otherwise pop the leftmost levels, names, and codes from the
             # source index until it has the correct number of columns (n-k)
             result.reset_index(drop=True)
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 401fef67ee6..6e5abb2b82b 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -961,14 +961,14 @@ def _merge_sorted(
     )
 
 
-def _pivot(df, index, columns):
+def _pivot(col_accessor: ColumnAccessor, index, columns) -> cudf.DataFrame:
     """
     Reorganize the values of the DataFrame according to the given
     index and columns.
 
     Parameters
     ----------
-    df : DataFrame
+    col_accessor : ColumnAccessor
     index : cudf.Index
         Index labels of the result
     columns : cudf.Index
@@ -985,7 +985,7 @@ def as_tuple(x):
             return x if isinstance(x, tuple) else (x,)
 
         nrows = len(index_labels)
-        for col_label, col in df._column_labels_and_values:
+        for col_label, col in col_accessor.items():
             names = [
                 as_tuple(col_label) + as_tuple(name) for name in column_labels
             ]
@@ -1067,22 +1067,21 @@ def pivot(data, columns=None, index=no_default, values=no_default):
         2  <NA>  <NA>  three
 
     """
-    df = data
     values_is_list = True
     if values is no_default:
-        values = df._columns_view(
-            col for col in df._column_names if col not in (index, columns)
-        )
+        cols_to_select = [
+            col for col in data._column_names if col not in (index, columns)
+        ]
+    elif not isinstance(values, (list, tuple)):
+        cols_to_select = [values]
+        values_is_list = False
     else:
-        if not isinstance(values, (list, tuple)):
-            values = [values]
-            values_is_list = False
-        values = df._columns_view(values)
+        cols_to_select = values
     if index is no_default:
-        index = df.index
+        index = data.index
     else:
-        index = cudf.Index(df.loc[:, index])
-    columns = cudf.Index(df.loc[:, columns])
+        index = cudf.Index(data.loc[:, index])
+    columns = cudf.Index(data.loc[:, columns])
 
     # Create a DataFrame composed of columns from both
     # columns and index
@@ -1096,7 +1095,7 @@ def pivot(data, columns=None, index=no_default, values=no_default):
     if len(columns_index) != len(columns_index.drop_duplicates()):
         raise ValueError("Duplicate index-column pairs found. Cannot reshape.")
 
-    result = _pivot(values, index, columns)
+    result = _pivot(data._data.select_by_label(cols_to_select), index, columns)
 
     # MultiIndex to Index
     if not values_is_list:
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index ef0f6958aeb..094df955273 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 import warnings
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
@@ -10,6 +10,9 @@
 from cudf.api.types import is_numeric_dtype
 from cudf.core.window.rolling import _RollingBase
 
+if TYPE_CHECKING:
+    from cudf.core.column.column import ColumnBase
+
 
 class ExponentialMovingWindow(_RollingBase):
     r"""
@@ -179,8 +182,10 @@ def cov(
     ):
         raise NotImplementedError("cov not yet supported.")
 
-    def _apply_agg_series(self, sr, agg_name):
-        if not is_numeric_dtype(sr.dtype):
+    def _apply_agg_column(
+        self, source_column: ColumnBase, agg_name: str
+    ) -> ColumnBase:
+        if not is_numeric_dtype(source_column.dtype):
             raise TypeError("No numeric types to aggregate")
 
         # libcudf ewm has special casing for nulls only
@@ -188,20 +193,14 @@ def _apply_agg_series(self, sr, agg_name):
         # pandas does nans in the same positions mathematically.
         # as such we need to convert the nans to nulls before
         # passing them in.
-        to_libcudf_column = sr._column.astype("float64").nans_to_nulls()
-
-        return self.obj._from_data_like_self(
-            self.obj._data._from_columns_like_self(
-                [
-                    scan(
-                        agg_name,
-                        to_libcudf_column,
-                        True,
-                        com=self.com,
-                        adjust=self.adjust,
-                    )
-                ]
-            )
+        to_libcudf_column = source_column.astype("float64").nans_to_nulls()
+
+        return scan(
+            agg_name,
+            to_libcudf_column,
+            True,
+            com=self.com,
+            adjust=self.adjust,
         )
 
 
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 043a41145e5..967edc2ab15 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import warnings
+from typing import TYPE_CHECKING
 
 import numba
 import pandas as pd
@@ -16,25 +17,29 @@
 from cudf.utils import cudautils
 from cudf.utils.utils import GetAttrGetItemMixin
 
+if TYPE_CHECKING:
+    from cudf.core.column.column import ColumnBase
+
 
 class _RollingBase:
     """
-    Contains methods common to all kinds of rolling
+    Contains routines to apply a window aggregation to a column.
     """
 
-    def _apply_agg_dataframe(self, df, agg_name):
-        result_df = cudf.DataFrame({})
-        for i, col_name in enumerate(df.columns):
-            result_col = self._apply_agg_series(df[col_name], agg_name)
-            result_df.insert(i, col_name, result_col)
-        result_df.index = df.index
-        return result_df
+    obj: cudf.DataFrame | cudf.Series
 
-    def _apply_agg(self, agg_name):
-        if isinstance(self.obj, cudf.Series):
-            return self._apply_agg_series(self.obj, agg_name)
-        else:
-            return self._apply_agg_dataframe(self.obj, agg_name)
+    def _apply_agg_column(
+        self, source_column: ColumnBase, agg_name: str
+    ) -> ColumnBase:
+        raise NotImplementedError
+
+    def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series:
+        applied = (
+            self._apply_agg_column(col, agg_name) for col in self.obj._columns
+        )
+        return self.obj._from_data_like_self(
+            self.obj._data._from_columns_like_self(applied)
+        )
 
 
 class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible):
@@ -290,14 +295,6 @@ def _apply_agg_column(self, source_column, agg_name):
             agg_params=self.agg_params,
         )
 
-    def _apply_agg(self, agg_name):
-        applied = (
-            self._apply_agg_column(col, agg_name) for col in self.obj._columns
-        )
-        return self.obj._from_data_like_self(
-            self.obj._data._from_columns_like_self(applied)
-        )
-
     def _reduce(
         self,
         op: str,

From 22d481a4e3a34d517ad9a9ac46b8b1b456d365c6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:45:47 -0400
Subject: [PATCH 13/14] Fix JsonLargeReaderTest.MultiBatch use of
 LIBCUDF_JSON_BATCH_SIZE env var (#16927)

Fixes the `unsetenv` call at the end of the test to clear `LIBCUDF_JSON_BATCH_SIZE` instead of `LIBCUDF_LARGE_STRINGS_THRESHOLD`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16927
---
 cpp/tests/large_strings/json_tests.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu
index 80bde168b75..a212d7d654a 100644
--- a/cpp/tests/large_strings/json_tests.cu
+++ b/cpp/tests/large_strings/json_tests.cu
@@ -96,5 +96,5 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
   }
 
   // go back to normal batch_size
-  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
+  unsetenv("LIBCUDF_JSON_BATCH_SIZE");
 }

From 6973ef806bc9d3cbda37a4c7caa763da12b84b7f Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 27 Sep 2024 16:50:15 -0400
Subject: [PATCH 14/14] Parse newline as whitespace character while tokenizing
 JSONL inputs with non-newline delimiter (#16923)

Addresses #16915

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Basit Ayantunde (https://github.com/lamarrr)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16923
---
 cpp/src/io/json/nested_json_gpu.cu     |   4 +-
 cpp/tests/io/json/json_test.cpp        |  24 ++++
 cpp/tests/io/json/nested_json_test.cpp | 178 +++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 1c15e147b13..bf81162a0ac 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId {
     constexpr auto pda_sgid_lookup_size =
       static_cast<int32_t>(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0]));
     // We map the delimiter character to LINE_BREAK symbol group id, and the newline character
-    // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
+    // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
     // escape, comma, colon or whitespace characters.
     auto const symbol_position =
       symbol == delimiter
         ? static_cast<int32_t>('\n')
-        : (symbol == '\n' ? static_cast<int32_t>(delimiter) : static_cast<int32_t>(symbol));
+        : (symbol == '\n' ? static_cast<int32_t>(' ') : static_cast<int32_t>(symbol));
     PdaSymbolGroupIdT symbol_gid =
       tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
     return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 68ec255b39d..a094ac7d772 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter)
   EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument);
 }
 
+TEST_F(JsonReaderTest, ViableDelimiterNewlineWS)
+{
+  // Test input
+  std::string input = R"({"a":
+  100})";
+
+  cudf::io::json_reader_options json_parser_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
+      .lines(true)
+      .delimiter('\0');
+
+  auto result = cudf::io::read_json(json_parser_options);
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 1);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+
+  auto col1_iterator = thrust::constant_iterator<int64_t>(100);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper(col1_iterator, col1_iterator + 1));
+}
+
 // Test case for dtype prune:
 // all paths, only one.
 // one present, another not present, nothing present
diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
index 327169ae563..f32aba0e632 100644
--- a/cpp/tests/io/json/nested_json_test.cpp
+++ b/cpp/tests/io/json/nested_json_test.cpp
@@ -29,6 +29,7 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
   }
 }
 
+TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter)
+{
+  // Test input. Inline comments used to indicate character indexes
+  //                           012345678 <= line 0
+  char const delimiter = GetParam();
+
+  /* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as
+   * a delimiter for JSONL)
+   * {"a":2}
+   * {"a":<delimiter>{"a":{"a":[321<delimiter>{"a":[1]}
+   *
+   * <delimiter>{"b":123}
+   * {"b":123}<delimiter>
+   * {"b"\n:\n\n\n123\n}
+   */
+  std::string input = R"({"a":2})"
+                      "\n";
+  // starting position 8 (zero indexed)
+  input += R"({"a":)" + std::string(1, delimiter);
+  // starting position 14 (zero indexed)
+  input += R"({"a":{"a":[321)" + std::string(1, delimiter);
+  // starting position 29 (zero indexed)
+  input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter);
+  // starting position 41 (zero indexed)
+  input += R"({"b":123})"
+           "\n";
+  // starting position 51 (zero indexed)
+  input += R"({"b":123})" + std::string(1, delimiter);
+  // starting position 61 (zero indexed)
+  input += R"({"b")" + std::string("\n:\n\n\n123\n}");
+
+  // Golden token stream sample
+  using token_t = cuio_json::token_t;
+  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream;
+  if (delimiter != '\n') {
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd},
+                           // Line 4 (valid)
+                           {61, token_t::StructBegin},
+                           {62, token_t::StructMemberBegin},
+                           {62, token_t::FieldNameBegin},
+                           {64, token_t::FieldNameEnd},
+                           {70, token_t::ValueBegin},
+                           {73, token_t::ValueEnd},
+                           {74, token_t::StructMemberEnd},
+                           {74, token_t::StructEnd}};
+  } else {
+    /* Input:
+     * {"a":2}
+     * {"a":
+     * {"a":{"a":[321
+     * {"a":[1]}
+     *
+     *
+     * {"b":123}
+     * {"b":123}
+     * {"b"\n:\n\n\n123\n}
+     */
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 4 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd},
+                           // Line 5 (valid)
+                           {51, token_t::StructBegin},
+                           {52, token_t::StructMemberBegin},
+                           {52, token_t::FieldNameBegin},
+                           {54, token_t::FieldNameEnd},
+                           {56, token_t::ValueBegin},
+                           {59, token_t::ValueEnd},
+                           {59, token_t::StructMemberEnd},
+                           {59, token_t::StructEnd},
+                           // Line 6 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd}};
+  }
+
+  auto const stream = cudf::get_default_stream();
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  // Parsing options: JSON lines, the parameterized delimiter, RECOVER_WITH_NULL recovery mode
+  cudf::io::json_reader_options const in_opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{})
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .delimiter(delimiter)
+      .lines(true);
+
+  // Parse the JSON and get the token stream
+  auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
+    d_input, in_opts, stream, cudf::get_current_device_resource_ref());
+  // Copy back the number of tokens that were written
+  auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
+  auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
+
+  stream.synchronize();
+  // Verify the number of tokens matches
+  ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
+  ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());
+
+  for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
+    // Ensure the indices the tokens point to match
+    EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
+    // Ensure the token category is correct
+    EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()