From 127d57485445001e5d0c9fdc20fd8e380d9a410d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Aug 2022 14:43:16 -0700 Subject: [PATCH] Fully support nested types in `cudf::contains` (#10656) This extends the `cudf::contains` API to support nested types (lists + structs) with arbitrarily nested levels. As such, `cudf::contains` will work with literally any type of input data. In addition, this fixes null handling of `cudf::contains` with structs column + struct scalar input when the structs column contains null rows at the top level while the scalar key is valid but all nulls at children levels. Closes: https://github.com/rapidsai/cudf/issues/8965 Depends on: * https://github.com/rapidsai/cudf/pull/10730 * https://github.com/rapidsai/cudf/pull/10883 * https://github.com/rapidsai/cudf/pull/10802 * https://github.com/rapidsai/cudf/pull/10997 * https://github.com/NVIDIA/cuCollections/issues/172 * https://github.com/NVIDIA/cuCollections/issues/173 * https://github.com/rapidsai/cudf/issues/11037 * https://github.com/rapidsai/cudf/pull/11356 Authors: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10656 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/search.hpp | 16 - cpp/src/search/contains_column.cu | 202 +++------- cpp/src/search/contains_nested.cu | 66 ---- cpp/src/search/contains_scalar.cu | 176 +++++++++ cpp/tests/CMakeLists.txt | 4 +- cpp/tests/search/search_list_test.cpp | 349 ++++++++++++++++++ cpp/tests/search/search_struct_test.cpp | 472 ++++++++++++++++-------- 8 files changed, 894 insertions(+), 393 deletions(-) delete mode 100644 cpp/src/search/contains_nested.cu create mode 100644 cpp/src/search/contains_scalar.cu create mode 100644 cpp/tests/search/search_list_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 44aaac54adb..bb8620cd99c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -449,8 +449,8 @@ add_library( src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/contains_column.cu + src/search/contains_scalar.cu src/search/contains_table.cu - src/search/contains_nested.cu src/search/search_ordered.cu src/sort/is_sorted.cu src/sort/rank.cu diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index a9764235c90..56d41fd635c 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -97,20 +97,4 @@ rmm::device_uvector contains( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Check if the (unique) row of the `needle` column is contained in the `haystack` column. - * - * If the input `needle` column has more than one row, only the first row will be considered. - * - * This function is designed for nested types only. It can also work with non-nested types - * but with lower performance due to the complexity of the implementation. - * - * @param haystack The column containing search space. - * @param needle A scalar value to check for existence in the search space. - * @return true if the given `needle` value exists in the `haystack` column. - */ -bool contains_nested_element(column_view const& haystack, - column_view const& needle, - rmm::cuda_stream_view stream); - } // namespace cudf::detail diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 5d068e72584..51d265263fb 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -17,22 +17,18 @@ #include #include +#include #include #include #include #include -#include -#include -#include -#include +#include #include #include #include -#include #include -#include #include #include @@ -41,96 +37,25 @@ namespace detail { namespace { -/** - * @brief Get the underlying value of a scalar through a scalar device view. - * - * @tparam Type The scalar's value type - * @tparam ScalarDView Type of the input scalar device view - * @param d_scalar The input scalar device view - */ -template -__device__ auto inline get_scalar_value(ScalarDView d_scalar) -{ - if constexpr (cudf::is_fixed_point()) { - return d_scalar.rep(); - } else { - return d_scalar.value(); - } -} - -struct contains_scalar_dispatch { - template - bool operator()(column_view const& haystack, - scalar const& needle, - rmm::cuda_stream_view stream) const - { - CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); - - using DType = device_storage_type_t; - auto const d_haystack = column_device_view::create(haystack, stream); - auto const d_needle = - get_scalar_device_view(static_cast&>(const_cast(needle))); - - if (haystack.has_nulls()) { - auto const begin = d_haystack->pair_begin(); - auto const end = d_haystack->pair_end(); - - return thrust::count_if( - rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val_pair) { - auto const needle_pair = thrust::make_pair(get_scalar_value(d_needle), true); - return val_pair == needle_pair; - }) > 0; - } else { - auto const begin = d_haystack->begin(); - auto const end = d_haystack->end(); - - return thrust::count_if( - rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val) { - return val == get_scalar_value(d_needle); - }) > 0; +struct contains_column_dispatch { + template + struct contains_fn { + bool __device__ operator()(size_type const idx) const + { + if (needles_have_nulls && needles.is_null_nocheck(idx)) { + // Exit early. The value doesn't matter, and will be masked as a null element. + return true; + } + + return haystack.contains(needles.template element(idx)); } - } -}; -template <> -bool contains_scalar_dispatch::operator()(column_view const&, - scalar const&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("list_view type not supported yet"); -} + Haystack const haystack; + column_device_view const needles; + bool const needles_have_nulls; + }; -template <> -bool contains_scalar_dispatch::operator()(column_view const& haystack, - scalar const& needle, - rmm::cuda_stream_view stream) const -{ - CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); - // Haystack and needle structure compatibility will be checked by the table comparator - // constructor during call to `contains_nested_element`. - - auto const needle_as_col = make_column_from_scalar(needle, 1, stream); - return contains_nested_element(haystack, needle_as_col->view(), stream); -} - -template <> -bool contains_scalar_dispatch::operator()(column_view const& haystack, - scalar const& needle, - rmm::cuda_stream_view stream) const -{ - auto const dict_col = cudf::dictionary_column_view(haystack); - // first, find the needle in the dictionary's key set - auto const index = cudf::dictionary::detail::get_index(dict_col, needle, stream); - // if found, check the index is actually in the indices column - return index->is_valid(stream) && cudf::type_dispatcher(dict_col.indices().type(), - contains_scalar_dispatch{}, - dict_col.indices(), - *index, - stream); -} - -struct multi_contains_dispatch { - template + template ())> std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -138,7 +63,7 @@ struct multi_contains_dispatch { { auto result = make_numeric_column(data_type{type_to_id()}, needles.size(), - copy_bitmask(needles), + copy_bitmask(needles, stream, mr), needles.null_count(), stream, mr); @@ -151,57 +76,38 @@ struct multi_contains_dispatch { return result; } - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); + auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); + auto const haystack_set_dv = haystack_set.to_device(); auto const needles_cdv_ptr = column_device_view::create(needles, stream); - auto const needles_it = thrust::make_counting_iterator(0); - - if (needles.has_nulls()) { - thrust::transform(rmm::exec_policy(stream), - needles_it, - needles_it + needles.size(), - out_begin, - [haystack = haystack_set.to_device(), - needles = *needles_cdv_ptr] __device__(size_type const idx) { - return needles.is_null_nocheck(idx) || - haystack.contains(needles.template element(idx)); - }); - } else { - thrust::transform(rmm::exec_policy(stream), - needles_it, - needles_it + needles.size(), - out_begin, - [haystack = haystack_set.to_device(), - needles = *needles_cdv_ptr] __device__(size_type const index) { - return haystack.contains(needles.template element(index)); - }); - } + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(needles.size()), + out_begin, + contains_fn{ + haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); return result; } -}; -template <> -std::unique_ptr multi_contains_dispatch::operator()( - column_view const&, - column_view const&, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const -{ - CUDF_FAIL("list_view type not supported"); -} - -template <> -std::unique_ptr multi_contains_dispatch::operator()( - column_view const&, - column_view const&, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const -{ - CUDF_FAIL("struct_view type not supported"); -} + template ())> + std::unique_ptr operator()(column_view const& haystack, + column_view const& needles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto result_v = detail::contains(table_view{{haystack}}, + table_view{{needles}}, + null_equality::EQUAL, + nan_equality::ALL_EQUAL, + stream, + mr); + return std::make_unique( + std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + } +}; template <> -std::unique_ptr multi_contains_dispatch::operator()( +std::unique_ptr contains_column_dispatch::operator()( column_view const& haystack_in, column_view const& needles_in, rmm::cuda_stream_view stream, @@ -219,22 +125,14 @@ std::unique_ptr multi_contains_dispatch::operator()( column_view const haystack_indices = haystack_view.get_indices_annotated(); column_view const needles_indices = needles_view.get_indices_annotated(); return cudf::type_dispatcher(haystack_indices.type(), - multi_contains_dispatch{}, + contains_column_dispatch{}, haystack_indices, needles_indices, stream, mr); } -} // namespace -bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) -{ - if (haystack.is_empty()) { return false; } - if (not needle.is_valid(stream)) { return haystack.has_nulls(); } - - return cudf::type_dispatcher( - haystack.type(), contains_scalar_dispatch{}, haystack, needle, stream); -} +} // namespace std::unique_ptr contains(column_view const& haystack, column_view const& needles, @@ -244,17 +142,11 @@ std::unique_ptr contains(column_view const& haystack, CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); return cudf::type_dispatcher( - haystack.type(), multi_contains_dispatch{}, haystack, needles, stream, mr); + haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } } // namespace detail -bool contains(column_view const& haystack, scalar const& needle) -{ - CUDF_FUNC_RANGE(); - return detail::contains(haystack, needle, cudf::default_stream_value); -} - std::unique_ptr contains(column_view const& haystack, column_view const& needles, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/search/contains_nested.cu b/cpp/src/search/contains_nested.cu deleted file mode 100644 index 6767b27a918..00000000000 --- a/cpp/src/search/contains_nested.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include - -namespace cudf::detail { - -bool contains_nested_element(column_view const& haystack, - column_view const& needle, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(needle.size() > 0, "Input needle column should have at least ONE row."); - - auto const haystack_tv = table_view{{haystack}}; - auto const needle_tv = table_view{{needle}}; - auto const has_nulls = has_nested_nulls(haystack_tv) || has_nested_nulls(needle_tv); - - auto const comparator = - cudf::experimental::row::equality::two_table_comparator(haystack_tv, needle_tv, stream); - auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - - auto const begin = cudf::experimental::row::lhs_iterator(0); - auto const end = begin + haystack.size(); - using cudf::experimental::row::rhs_index_type; - - if (haystack.has_nulls()) { - auto const haystack_cdv_ptr = column_device_view::create(haystack, stream); - auto const haystack_valid_it = cudf::detail::make_validity_iterator(*haystack_cdv_ptr); - - return thrust::count_if(rmm::exec_policy(stream), - begin, - end, - [d_comp, haystack_valid_it] __device__(auto const idx) { - if (!haystack_valid_it[static_cast(idx)]) { return false; } - return d_comp( - idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }) > 0; - } - - return thrust::count_if( - rmm::exec_policy(stream), begin, end, [d_comp] __device__(auto const idx) { - return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }) > 0; -} - -} // namespace cudf::detail diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu new file mode 100644 index 00000000000..e64cca44507 --- /dev/null +++ b/cpp/src/search/contains_scalar.cu @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +/** + * @brief Get the underlying value of a scalar through a scalar device view. + * + * @tparam Element The scalar's value type + * @tparam ScalarDView Type of the input scalar device view + * @param d_scalar The input scalar device view + */ +template +__device__ auto inline get_scalar_value(ScalarDView d_scalar) +{ + if constexpr (cudf::is_fixed_point()) { + return d_scalar.rep(); + } else { + return d_scalar.value(); + } +} + +struct contains_scalar_dispatch { + // SFINAE with conditional return type because we need to support device lambda in this function. + // This is required due to a limitation of nvcc. + template + std::enable_if_t(), bool> operator()(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream) const + { + CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match"); + // Don't need to check for needle validity. If it is invalid, it should be handled by the caller + // before dispatching to this function. + + using DType = device_storage_type_t; + auto const d_haystack = column_device_view::create(haystack, stream); + auto const d_needle = get_scalar_device_view( + static_cast&>(const_cast(needle))); + + if (haystack.has_nulls()) { + auto const begin = d_haystack->pair_begin(); + auto const end = d_haystack->pair_end(); + + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val_pair) { + auto const needle_pair = + thrust::make_pair(get_scalar_value(d_needle), true); + return val_pair == needle_pair; + }) > 0; + } else { + auto const begin = d_haystack->begin(); + auto const end = d_haystack->end(); + + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val) { + return val == get_scalar_value(d_needle); + }) > 0; + } + } + + template + std::enable_if_t(), bool> operator()(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream) const + { + CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match"); + // Don't need to check for needle validity. If it is invalid, it should be handled by the caller + // before dispatching to this function. + // In addition, haystack and needle structure compatibility will be checked later on by + // constructor of the table comparator. + + auto const haystack_tv = table_view{{haystack}}; + auto const needle_as_col = make_column_from_scalar(needle, 1, stream); + auto const needle_tv = table_view{{needle_as_col->view()}}; + auto const has_nulls = has_nested_nulls(haystack_tv) || has_nested_nulls(needle_tv); + + auto const comparator = + cudf::experimental::row::equality::two_table_comparator(haystack_tv, needle_tv, stream); + auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); + + auto const begin = cudf::experimental::row::lhs_iterator(0); + auto const end = begin + haystack.size(); + using cudf::experimental::row::rhs_index_type; + + if (haystack.has_nulls()) { + auto const haystack_cdv_ptr = column_device_view::create(haystack, stream); + auto const haystack_valid_it = cudf::detail::make_validity_iterator(*haystack_cdv_ptr); + + return thrust::count_if(rmm::exec_policy(stream), + begin, + end, + [d_comp, haystack_valid_it] __device__(auto const idx) { + if (!haystack_valid_it[static_cast(idx)]) { + return false; + } + return d_comp( + idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }) > 0; + } + + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_comp] __device__(auto const idx) { + return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }) > 0; + } +}; + +template <> +bool contains_scalar_dispatch::operator()(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream) const +{ + auto const dict_col = cudf::dictionary_column_view(haystack); + // first, find the needle in the dictionary's key set + auto const index = cudf::dictionary::detail::get_index(dict_col, needle, stream); + // if found, check the index is actually in the indices column + return index->is_valid(stream) && cudf::type_dispatcher(dict_col.indices().type(), + contains_scalar_dispatch{}, + dict_col.indices(), + *index, + stream); +} + +} // namespace + +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) +{ + if (haystack.is_empty()) { return false; } + if (not needle.is_valid(stream)) { return haystack.has_nulls(); } + + return cudf::type_dispatcher( + haystack.type(), contains_scalar_dispatch{}, haystack, needle, stream); +} + +} // namespace detail + +bool contains(column_view const& haystack, scalar const& needle) +{ + CUDF_FUNC_RANGE(); + return detail::contains(haystack, needle, cudf::default_stream_value); +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8aba2a11d10..1964db53659 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -358,8 +358,8 @@ ConfigureTest( # ################################################################################################## # * search test ----------------------------------------------------------------------------------- ConfigureTest( - SEARCH_TEST search/search_dictionary_test.cpp search/search_struct_test.cpp - search/search_test.cpp + SEARCH_TEST search/search_dictionary_test.cpp search/search_list_test.cpp + search/search_struct_test.cpp search/search_test.cpp ) # ################################################################################################## diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp new file mode 100644 index 00000000000..1393095037d --- /dev/null +++ b/cpp/tests/search/search_list_test.cpp @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace cudf::test::iterators; + +using bools_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; +constexpr int32_t null{0}; // Mark for null child elements at the current level +constexpr int32_t XXX{0}; // Mark for null elements at all levels +constexpr int32_t dont_care{0}; // Mark for elements that will be sliced off + +using TestTypes = cudf::test::Concat; + +template +struct TypedListsContainsTestScalarNeedle : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(TypedListsContainsTestScalarNeedle, TestTypes); + +TYPED_TEST(TypedListsContainsTestScalarNeedle, EmptyInput) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack = lists_col{}; + + auto const needle1 = [] { + auto child = tdata_col{}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{1, 2, 3}; + return cudf::list_scalar(child); + }(); + + EXPECT_FALSE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); +} + +TYPED_TEST(TypedListsContainsTestScalarNeedle, TrivialInput) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack = lists_col{{1, 2}, {1}, {}, {1, 3}, {4}, {1, 1}}; + + auto const needle1 = [] { + auto child = tdata_col{1, 2}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{2, 1}; + return cudf::list_scalar(child); + }(); + + EXPECT_TRUE(cudf::contains(haystack, needle1)); + + // Lists are order-sensitive. + EXPECT_FALSE(cudf::contains(haystack, needle2)); +} + +TYPED_TEST(TypedListsContainsTestScalarNeedle, SlicedColumnInput) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack_original = + lists_col{{dont_care, dont_care}, {dont_care}, {1, 2}, {1}, {}, {1, 3}, {dont_care, dont_care}}; + auto const haystack = cudf::slice(haystack_original, {2, 6})[0]; + + auto const needle1 = [] { + auto child = tdata_col{1, 2}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{}; + return cudf::list_scalar(child); + }(); + auto const needle3 = [] { + auto child = tdata_col{dont_care, dont_care}; + return cudf::list_scalar(child); + }(); + + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_TRUE(cudf::contains(haystack, needle2)); + EXPECT_FALSE(cudf::contains(haystack, needle3)); +} + +TYPED_TEST(TypedListsContainsTestScalarNeedle, SimpleInputWithNulls) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + // Test with invalid scalar. + { + auto const haystack = lists_col{{1, 2}, {1}, {}, {1, 3}, {4}, {}, {1, 1}}; + auto const needle = [] { + auto child = tdata_col{}; + return cudf::list_scalar(child, false); + }(); + + EXPECT_FALSE(cudf::contains(haystack, needle)); + } + + // Test with nulls at the top level. + { + auto const haystack = + lists_col{{{1, 2}, {1}, {} /*NULL*/, {1, 3}, {4}, {} /*NULL*/, {1, 1}}, nulls_at({2, 5})}; + + auto const needle1 = [] { + auto child = tdata_col{1, 2}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{}; + return cudf::list_scalar(child, false); + }(); + + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_TRUE(cudf::contains(haystack, needle2)); + } + + // Test with nulls at the children level. + { + auto const haystack = lists_col{{lists_col{1, 2}, + lists_col{1}, + lists_col{{1, null}, null_at(1)}, + lists_col{} /*NULL*/, + lists_col{1, 3}, + lists_col{1, 4}, + lists_col{4}, + lists_col{} /*NULL*/, + lists_col{1, 1}}, + nulls_at({3, 7})}; + + auto const needle1 = [] { + auto child = tdata_col{{1, null}, null_at(1)}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{{null, 1}, null_at(0)}; + return cudf::list_scalar(child); + }(); + auto const needle3 = [] { + auto child = tdata_col{1, 0}; + return cudf::list_scalar(child); + }(); + + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); + EXPECT_FALSE(cudf::contains(haystack, needle3)); + } +} + +TYPED_TEST(TypedListsContainsTestScalarNeedle, SlicedInputHavingNulls) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack_original = lists_col{{{dont_care, dont_care}, + {dont_care} /*NULL*/, + lists_col{{1, null}, null_at(1)}, + {1}, + {} /*NULL*/, + {1, 3}, + {4}, + {} /*NULL*/, + {1, 1}, + {dont_care}}, + nulls_at({1, 4, 7})}; + auto const haystack = cudf::slice(haystack_original, {2, 9})[0]; + + auto const needle1 = [] { + auto child = tdata_col{{1, null}, null_at(1)}; + return cudf::list_scalar(child); + }(); + auto const needle2 = [] { + auto child = tdata_col{}; + return cudf::list_scalar(child); + }(); + auto const needle3 = [] { + auto child = tdata_col{dont_care, dont_care}; + return cudf::list_scalar(child); + }(); + + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); + EXPECT_FALSE(cudf::contains(haystack, needle3)); +} + +template +struct TypedListContainsTestColumnNeedles : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(TypedListContainsTestColumnNeedles, TestTypes); + +TYPED_TEST(TypedListContainsTestColumnNeedles, EmptyInput) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack = lists_col{}; + auto const needles = lists_col{}; + auto const expected = bools_col{}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedListContainsTestColumnNeedles, TrivialInput) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack = lists_col{{0, 1}, {2}, {3, 4, 5}, {2, 3, 4}, {}, {0, 2, 0}}; + auto const needles = lists_col{{0, 1}, {1}, {3, 5, 4}, {}}; + + auto const expected = bools_col{1, 0, 0, 1}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedListContainsTestColumnNeedles, SlicedInputNoNulls) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack_original = lists_col{ + {dont_care, dont_care}, {dont_care}, {0, 1}, {2}, {3, 4, 5}, {2, 3, 4}, {}, {0, 2, 0}}; + auto const haystack = cudf::slice(haystack_original, {2, 8})[0]; + + auto const needles_original = + lists_col{{dont_care}, {0, 1}, {0, 0}, {3, 5, 4}, {}, {dont_care, dont_care}, {} /*dont_care*/}; + auto const needles = cudf::slice(needles_original, {1, 5})[0]; + + auto const expected = bools_col{1, 0, 0, 1}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedListContainsTestColumnNeedles, SlicedInputHavingNulls) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack_original = lists_col{{{dont_care, dont_care}, + {dont_care} /*NULL*/, + lists_col{{1, null}, null_at(1)}, + {1}, + {} /*NULL*/, + {1, 3}, + {4}, + {} /*NULL*/, + {1, 1}, + {dont_care}}, + nulls_at({1, 4, 7})}; + auto const haystack = cudf::slice(haystack_original, {2, 9})[0]; + + auto const needles_original = lists_col{{{dont_care, dont_care}, + {dont_care} /*NULL*/, + lists_col{{1, null}, null_at(1)}, + {1}, + {} /*NULL*/, + {1, 3, 1}, + {4}, + {} /*NULL*/, + {}, + {dont_care}}, + nulls_at({1, 4, 7})}; + auto const needles = cudf::slice(needles_original, {2, 9})[0]; + + auto const expected = bools_col{{1, 1, null, 0, 1, null, 0}, nulls_at({2, 5})}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedListContainsTestColumnNeedles, ListsOfStructs) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + + auto const haystack = [] { + auto offsets = int32s_col{0, 2, 3, 5, 8, 10}; + // clang-format off + auto data1 = tdata_col{1, 2, // + 1, // + 0, 1, // + 1, 3, 4, // + 0, 0 // + }; + auto data2 = tdata_col{1, 3, // + 2, // + 1, 1, // + 0, 2, 0, // + 1, 2 // + }; + // clang-format on + auto child = structs_col{{data1, data2}}; + return cudf::make_lists_column(5, offsets.release(), child.release(), 0, {}); + }(); + + auto const needles = [] { + auto offsets = int32s_col{0, 3, 4, 6, 9, 11}; + // clang-format off + auto data1 = tdata_col{1, 2, 1, // + 1, // + 0, 1, // + 1, 3, 4, // + 0, 0 // + }; + auto data2 = tdata_col{1, 3, 0, // + 2, // + 1, 1, // + 0, 2, 2, // + 1, 1 // + }; + // clang-format on + auto child = structs_col{{data1, data2}}; + return cudf::make_lists_column(5, offsets.release(), child.release(), 0, {}); + }(); + + auto const expected = bools_col{0, 1, 1, 0, 0}; + auto const result = cudf::contains(*haystack, *needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index 159b082890a..5d9ef85a249 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -33,8 +33,9 @@ using structs_col = cudf::test::structs_column_wrapper; using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; -constexpr int32_t null{0}; // Mark for null child elements -constexpr int32_t XXX{0}; // Mark for null struct elements +constexpr int32_t null{0}; // Mark for null child elements at the current level +constexpr int32_t XXX{0}; // Mark for null elements at all levels +constexpr int32_t dont_care{0}; // Mark for elements that will be sliced off using TestTypes = cudf::test::Concat const& t_col, return search_bounds(t_col->view(), values_col, column_orders, null_precedence); } +template +auto make_struct_scalar(Args&&... args) +{ + return cudf::struct_scalar(std::vector{std::forward(args)...}); +} + } // namespace // Test case when all input columns are empty -TYPED_TEST(TypedStructSearchTest, EmptyInputTest) +TYPED_TEST(TypedStructSearchTest, EmptyInput) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_t = col_wrapper{}; + auto child_col_t = tdata_col{}; auto const structs_t = structs_col{{child_col_t}, std::vector{}}.release(); - auto child_col_values = col_wrapper{}; + auto child_col_values = tdata_col{}; auto const structs_values = structs_col{{child_col_values}, std::vector{}}.release(); auto const results = search_bounds(structs_t, structs_values); @@ -90,15 +97,15 @@ TYPED_TEST(TypedStructSearchTest, EmptyInputTest) TYPED_TEST(TypedStructSearchTest, TrivialInputTests) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_t = col_wrapper{10, 20, 30, 40, 50}; + auto child_col_t = tdata_col{10, 20, 30, 40, 50}; auto const structs_t = structs_col{{child_col_t}}.release(); - auto child_col_values1 = col_wrapper{0, 1, 2, 3, 4}; + auto child_col_values1 = tdata_col{0, 1, 2, 3, 4}; auto const structs_values1 = structs_col{{child_col_values1}}.release(); - auto child_col_values2 = col_wrapper{100, 101, 102, 103, 104}; + auto child_col_values2 = tdata_col{100, 101, 102, 103, 104}; auto const structs_values2 = structs_col{{child_col_values2}}.release(); auto const results1 = search_bounds(structs_t, structs_values1); @@ -114,12 +121,12 @@ TYPED_TEST(TypedStructSearchTest, TrivialInputTests) TYPED_TEST(TypedStructSearchTest, SlicedColumnInputTests) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_values = col_wrapper{0, 1, 2, 3, 4, 5}; + auto child_col_values = tdata_col{0, 1, 2, 3, 4, 5}; auto const structs_values = structs_col{child_col_values}.release(); - auto child_col_t = col_wrapper{0, 1, 2, 2, 2, 2, 3, 3, 4, 4}; + auto child_col_t = tdata_col{0, 1, 2, 2, 2, 2, 3, 3, 4, 4}; auto const structs_t_original = structs_col{child_col_t}.release(); auto structs_t = cudf::slice(structs_t_original->view(), {0, 10})[0]; // the entire column t @@ -146,13 +153,13 @@ TYPED_TEST(TypedStructSearchTest, SlicedColumnInputTests) TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_values = col_wrapper{{1, null, 70, XXX, 2, 100}, null_at(1)}; + auto child_col_values = tdata_col{{1, null, 70, XXX, 2, 100}, null_at(1)}; auto const structs_values = structs_col{{child_col_values}, null_at(3)}.release(); // Sorted asc, nulls first - auto child_col_t = col_wrapper{{XXX, null, 0, 1, 2, 2, 2, 2, 3, 3, 4}, null_at(1)}; + auto child_col_t = tdata_col{{XXX, null, 0, 1, 2, 2, 2, 2, 3, 3, 4}, null_at(1)}; auto structs_t = structs_col{{child_col_t}, null_at(0)}.release(); auto results = @@ -163,7 +170,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted asc, nulls last - child_col_t = col_wrapper{{0, 1, 2, 2, 2, 2, 3, 3, 4, null, XXX}, null_at(9)}; + child_col_t = tdata_col{{0, 1, 2, 2, 2, 2, 3, 3, 4, null, XXX}, null_at(9)}; structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); @@ -173,7 +180,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted dsc, nulls first - child_col_t = col_wrapper{{XXX, null, 4, 3, 3, 2, 2, 2, 2, 1, 0}, null_at(1)}; + child_col_t = tdata_col{{XXX, null, 4, 3, 3, 2, 2, 2, 2, 1, 0}, null_at(1)}; structs_t = structs_col{{child_col_t}, null_at(0)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::BEFORE}); @@ -183,7 +190,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted dsc, nulls last - child_col_t = col_wrapper{{4, 3, 3, 2, 2, 2, 2, 1, 0, null, XXX}, null_at(9)}; + child_col_t = tdata_col{{4, 3, 3, 2, 2, 2, 2, 1, 0, null, XXX}, null_at(9)}; structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); @@ -195,13 +202,13 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithNullsTests) TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_values = col_wrapper{{1, null, 70, XXX, 2, 100}, null_at(1)}; + auto child_col_values = tdata_col{{1, null, 70, XXX, 2, 100}, null_at(1)}; auto const structs_values = structs_col{{child_col_values}, null_at(3)}.release(); // Sorted asc, search nulls first - auto child_col_t = col_wrapper{0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 4}; + auto child_col_t = tdata_col{0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 4}; auto structs_t = structs_col{{child_col_t}}.release(); auto results = @@ -220,7 +227,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted dsc, search nulls first - child_col_t = col_wrapper{4, 3, 3, 2, 2, 2, 2, 1, 0, 0, 0}; + child_col_t = tdata_col{4, 3, 3, 2, 2, 2, 2, 1, 0, 0, 0}; structs_t = structs_col{{child_col_t}}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::BEFORE}); @@ -240,13 +247,13 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithValuesHavingNullsTests) TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col_values = col_wrapper{1, 0, 70, 0, 2, 100}; + auto child_col_values = tdata_col{1, 0, 70, 0, 2, 100}; auto const structs_values = structs_col{{child_col_values}}.release(); // Sorted asc, nulls first - auto child_col_t = col_wrapper{{XXX, null, 0, 1, 2, 2, 2, 2, 3, 3, 4}, null_at(1)}; + auto child_col_t = tdata_col{{XXX, null, 0, 1, 2, 2, 2, 2, 3, 3, 4}, null_at(1)}; auto structs_t = structs_col{{child_col_t}, null_at(0)}.release(); auto results = @@ -257,7 +264,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted asc, nulls last - child_col_t = col_wrapper{{0, 1, 2, 2, 2, 2, 3, 3, 4, null, XXX}, null_at(9)}; + child_col_t = tdata_col{{0, 1, 2, 2, 2, 2, 3, 3, 4, null, XXX}, null_at(9)}; structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); @@ -267,7 +274,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted dsc, nulls first - child_col_t = col_wrapper{{XXX, null, 4, 3, 3, 2, 2, 2, 2, 1, 0}, null_at(1)}; + child_col_t = tdata_col{{XXX, null, 4, 3, 3, 2, 2, 2, 2, 1, 0}, null_at(1)}; structs_t = structs_col{{child_col_t}, null_at(0)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::BEFORE}); @@ -277,7 +284,7 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); // Sorted dsc, nulls last - child_col_t = col_wrapper{{4, 3, 3, 2, 2, 2, 2, 1, 0, null, XXX}, null_at(9)}; + child_col_t = tdata_col{{4, 3, 3, 2, 2, 2, 2, 1, 0, null, XXX}, null_at(9)}; structs_t = structs_col{{child_col_t}, null_at(10)}.release(); results = search_bounds(structs_t, structs_values, {cudf::order::DESCENDING}, {cudf::null_order::AFTER}); @@ -289,16 +296,16 @@ TYPED_TEST(TypedStructSearchTest, SimpleInputWithTargetHavingNullsTests) TYPED_TEST(TypedStructSearchTest, OneColumnHasNullMaskButNoNullElementTest) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto child_col1 = col_wrapper{1, 20, 30}; + auto child_col1 = tdata_col{1, 20, 30}; auto const structs_col1 = structs_col{{child_col1}}.release(); - auto child_col2 = col_wrapper{0, 10, 10}; + auto child_col2 = tdata_col{0, 10, 10}; auto const structs_col2 = structs_col{child_col2}.release(); // structs_col3 (and its child column) will have a null mask but no null element - auto child_col3 = col_wrapper{{0, 10, 10}, no_nulls()}; + auto child_col3 = tdata_col{{0, 10, 10}, no_nulls()}; auto const structs_col3 = structs_col{{child_col3}, no_nulls()}.release(); // Search struct elements of structs_col2 and structs_col3 in the column structs_col1 @@ -329,18 +336,18 @@ TYPED_TEST(TypedStructSearchTest, OneColumnHasNullMaskButNoNullElementTest) TYPED_TEST(TypedStructSearchTest, ComplexStructTest) { // Testing on struct. - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; auto names_column_t = strings_col{"Cherry", "Kiwi", "Lemon", "Newton", "Tomato", /*NULL*/ "Washington"}; - auto ages_column_t = col_wrapper{{5, 10, 15, 20, null, XXX}, null_at(4)}; + auto ages_column_t = tdata_col{{5, 10, 15, 20, null, XXX}, null_at(4)}; auto is_human_col_t = bools_col{false, false, false, false, false, /*NULL*/ true}; auto const structs_t = structs_col{{names_column_t, ages_column_t, is_human_col_t}, null_at(5)}.release(); auto names_column_values = strings_col{"Bagel", "Tomato", "Lemonade", /*NULL*/ "Donut", "Butter"}; - auto ages_column_values = col_wrapper{{10, null, 15, XXX, 17}, null_at(1)}; + auto ages_column_values = tdata_col{{10, null, 15, XXX, 17}, null_at(1)}; auto is_human_col_values = bools_col{false, false, true, /*NULL*/ true, true}; auto const structs_values = structs_col{{names_column_values, ages_column_values, is_human_col_values}, null_at(3)} @@ -355,232 +362,391 @@ TYPED_TEST(TypedStructSearchTest, ComplexStructTest) } template -struct TypedScalarStructContainTest : public cudf::test::BaseFixture { +struct TypedStructContainsTestScalarNeedle : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(TypedScalarStructContainTest, TestTypes); +TYPED_TEST_SUITE(TypedStructContainsTestScalarNeedle, TestTypes); -TYPED_TEST(TypedScalarStructContainTest, EmptyInputTest) +TYPED_TEST(TypedStructContainsTestScalarNeedle, EmptyInput) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto const col = [] { - auto child = col_wrapper{}; + auto const haystack = [] { + auto child = tdata_col{}; return structs_col{{child}}; }(); - auto const val = [] { - auto child = col_wrapper{1}; - return cudf::struct_scalar(std::vector{child}); + auto const needle1 = [] { + auto child = tdata_col{1}; + return make_struct_scalar(child); + }(); + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{1}; + return make_struct_scalar(child1, child2); }(); - EXPECT_EQ(false, cudf::contains(col, val)); + EXPECT_FALSE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); } -TYPED_TEST(TypedScalarStructContainTest, TrivialInputTests) +TYPED_TEST(TypedStructContainsTestScalarNeedle, TrivialInput) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - auto const col = [] { - auto child1 = col_wrapper{1, 2, 3}; - auto child2 = col_wrapper{4, 5, 6}; + auto const haystack = [] { + auto child1 = tdata_col{1, 2, 3}; + auto child2 = tdata_col{4, 5, 6}; auto child3 = strings_col{"x", "y", "z"}; return structs_col{{child1, child2, child3}}; }(); - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"x"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"a"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); } -TYPED_TEST(TypedScalarStructContainTest, SlicedColumnInputTests) +TYPED_TEST(TypedStructContainsTestScalarNeedle, SlicedColumnInput) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; - constexpr int32_t dont_care{0}; - - auto const col_original = [] { - auto child1 = col_wrapper{dont_care, dont_care, 1, 2, 3, dont_care}; - auto child2 = col_wrapper{dont_care, dont_care, 4, 5, 6, dont_care}; + auto const haystack_original = [] { + auto child1 = tdata_col{dont_care, dont_care, 1, 2, 3, dont_care}; + auto child2 = tdata_col{dont_care, dont_care, 4, 5, 6, dont_care}; auto child3 = strings_col{"dont_care", "dont_care", "x", "y", "z", "dont_care"}; return structs_col{{child1, child2, child3}}; }(); - auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const haystack = cudf::slice(haystack_original, {2, 5})[0]; - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"x"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{dont_care}; - auto child2 = col_wrapper{dont_care}; + auto const needle2 = [] { + auto child1 = tdata_col{dont_care}; + auto child2 = tdata_col{dont_care}; auto child3 = strings_col{"dont_care"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); } -TYPED_TEST(TypedScalarStructContainTest, SimpleInputWithNullsTests) +TYPED_TEST(TypedStructContainsTestScalarNeedle, SimpleInputWithNulls) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; + using tdata_col = cudf::test::fixed_width_column_wrapper; constexpr int32_t null{0}; // Test with nulls at the top level. { - auto const col = [] { - auto child1 = col_wrapper{1, null, 3}; - auto child2 = col_wrapper{4, null, 6}; + auto const col1 = [] { + auto child1 = tdata_col{1, null, 3}; + auto child2 = tdata_col{4, null, 6}; auto child3 = strings_col{"x", "" /*NULL*/, "z"}; return structs_col{{child1, child2, child3}, null_at(1)}; }(); - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"x"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"a"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); + }(); + auto const needle3 = [] { + auto child1 = tdata_col{{null}, null_at(0)}; + auto child2 = tdata_col{{null}, null_at(0)}; + auto child3 = strings_col{{""}, null_at(0)}; + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(col1, needle1)); + EXPECT_FALSE(cudf::contains(col1, needle2)); + EXPECT_FALSE(cudf::contains(col1, needle3)); } // Test with nulls at the children level. { auto const col = [] { - auto child1 = col_wrapper{{1, null, 3}, null_at(1)}; - auto child2 = col_wrapper{{4, null, 6}, null_at(1)}; - auto child3 = strings_col{{"" /*NULL*/, "y", "z"}, null_at(0)}; + auto child1 = tdata_col{{1, null, 3}, null_at(1)}; + auto child2 = tdata_col{{4, null, 6}, null_at(1)}; + auto child3 = strings_col{{"" /*NULL*/, "" /*NULL*/, "z"}, nulls_at({0, 1})}; return structs_col{{child1, child2, child3}}; }(); - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{{"" /*NULL*/}, null_at(0)}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{""}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); + }(); + auto const needle3 = [] { + auto child1 = tdata_col{{null}, null_at(0)}; + auto child2 = tdata_col{{null}, null_at(0)}; + auto child3 = strings_col{{""}, null_at(0)}; + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(col, needle1)); + EXPECT_FALSE(cudf::contains(col, needle2)); + EXPECT_TRUE(cudf::contains(col, needle3)); } // Test with nulls in the input scalar. { - auto const col = [] { - auto child1 = col_wrapper{1, 2, 3}; - auto child2 = col_wrapper{4, 5, 6}; + auto const haystack = [] { + auto child1 = tdata_col{1, 2, 3}; + auto child2 = tdata_col{4, 5, 6}; auto child3 = strings_col{"x", "y", "z"}; return structs_col{{child1, child2, child3}}; }(); - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"x"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{{"" /*NULL*/}, null_at(0)}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); } } -TYPED_TEST(TypedScalarStructContainTest, SlicedInputWithNullsTests) +TYPED_TEST(TypedStructContainsTestScalarNeedle, SlicedInputWithNulls) { - using col_wrapper = cudf::test::fixed_width_column_wrapper; - - constexpr int32_t dont_care{0}; - constexpr int32_t null{0}; + using tdata_col = cudf::test::fixed_width_column_wrapper; // Test with nulls at the top level. { - auto const col_original = [] { - auto child1 = col_wrapper{dont_care, dont_care, 1, null, 3, dont_care}; - auto child2 = col_wrapper{dont_care, dont_care, 4, null, 6, dont_care}; + auto const haystack_original = [] { + auto child1 = tdata_col{dont_care, dont_care, 1, null, 3, dont_care}; + auto child2 = tdata_col{dont_care, dont_care, 4, null, 6, dont_care}; auto child3 = strings_col{"dont_care", "dont_care", "x", "" /*NULL*/, "z", "dont_care"}; return structs_col{{child1, child2, child3}, null_at(3)}; }(); - auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const col = cudf::slice(haystack_original, {2, 5})[0]; - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"x"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle2 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{"a"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(col, needle1)); + EXPECT_FALSE(cudf::contains(col, needle2)); } // Test with nulls at the children level. { - auto const col_original = [] { + auto const haystack_original = [] { auto child1 = - col_wrapper{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)}; + tdata_col{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)}; auto child2 = - col_wrapper{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)}; + tdata_col{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)}; auto child3 = strings_col{ {"dont_care", "dont_care" /*also NULL*/, "" /*NULL*/, "y", "z", "dont_care"}, null_at(2)}; return structs_col{{child1, child2, child3}, null_at(1)}; }(); - auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const haystack = cudf::slice(haystack_original, {2, 5})[0]; - auto const val1 = [] { - auto child1 = col_wrapper{1}; - auto child2 = col_wrapper{4}; + auto const needle1 = [] { + auto child1 = tdata_col{1}; + auto child2 = tdata_col{4}; auto child3 = strings_col{{"x"}, null_at(0)}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - auto const val2 = [] { - auto child1 = col_wrapper{dont_care}; - auto child2 = col_wrapper{dont_care}; + auto const needle2 = [] { + auto child1 = tdata_col{dont_care}; + auto child2 = tdata_col{dont_care}; auto child3 = strings_col{"dont_care"}; - return cudf::struct_scalar(std::vector{child1, child2, child3}); + return make_struct_scalar(child1, child2, child3); }(); - EXPECT_EQ(true, cudf::contains(col, val1)); - EXPECT_EQ(false, cudf::contains(col, val2)); + EXPECT_TRUE(cudf::contains(haystack, needle1)); + EXPECT_FALSE(cudf::contains(haystack, needle2)); + } +} + +template +struct TypedStructContainsTestColumnNeedles : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(TypedStructContainsTestColumnNeedles, TestTypes); + +TYPED_TEST(TypedStructContainsTestColumnNeedles, EmptyInput) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + + auto const haystack = [] { + auto child1 = tdata_col{}; + auto child2 = tdata_col{}; + auto child3 = strings_col{}; + return structs_col{{child1, child2, child3}}; + }(); + + { + auto const needles = [] { + auto child1 = tdata_col{}; + auto child2 = tdata_col{}; + auto child3 = strings_col{}; + return structs_col{{child1, child2, child3}}; + }(); + auto const expected = bools_col{}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); + } + + { + auto const needles = [] { + auto child1 = tdata_col{1, 2}; + auto child2 = tdata_col{0, 2}; + auto child3 = strings_col{"x", "y"}; + return structs_col{{child1, child2, child3}}; + }(); + auto const result = cudf::contains(haystack, needles); + auto const expected = bools_col{0, 0}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); } } + +TYPED_TEST(TypedStructContainsTestColumnNeedles, TrivialInput) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + + auto const haystack = [] { + auto child1 = tdata_col{1, 3, 1, 1, 2, 1, 2, 2, 1, 2}; + auto child2 = tdata_col{1, 0, 0, 0, 1, 0, 1, 2, 1, 1}; + return structs_col{{child1, child2}}; + }(); + + auto const needles = [] { + auto child1 = tdata_col{1, 3, 1, 1, 2, 1, 0, 0, 1, 0}; + auto child2 = tdata_col{1, 0, 2, 3, 2, 1, 0, 0, 1, 0}; + return structs_col{{child1, child2}}; + }(); + + auto const expected = bools_col{1, 1, 0, 0, 1, 1, 0, 0, 1, 0}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedStructContainsTestColumnNeedles, SlicedInputNoNulls) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + + auto const haystack_original = [] { + auto child1 = tdata_col{dont_care, dont_care, 1, 3, 1, 1, 2, dont_care}; + auto child2 = tdata_col{dont_care, dont_care, 1, 0, 0, 0, 1, dont_care}; + auto child3 = strings_col{"dont_care", "dont_care", "x", "y", "z", "a", "b", "dont_care"}; + return structs_col{{child1, child2, child3}}; + }(); + auto const haystack = cudf::slice(haystack_original, {2, 7})[0]; + + auto const needles_original = [] { + auto child1 = tdata_col{dont_care, 1, 1, 1, 1, 2, dont_care, dont_care}; + auto child2 = tdata_col{dont_care, 0, 1, 2, 3, 1, dont_care, dont_care}; + auto child3 = strings_col{"dont_care", "z", "x", "z", "a", "b", "dont_care", "dont_care"}; + return structs_col{{child1, child2, child3}}; + }(); + auto const needles = cudf::slice(needles_original, {1, 6})[0]; + + auto const expected = bools_col{1, 1, 0, 0, 1}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedStructContainsTestColumnNeedles, SlicedInputHavingNulls) +{ + using tdata_col = cudf::test::fixed_width_column_wrapper; + + auto const haystack_original = [] { + auto child1 = + tdata_col{{dont_care /*null*/, dont_care, 1, null, XXX, 1, 2, null, 2, 2, null, 2, dont_care}, + nulls_at({0, 3, 7, 10})}; + auto child2 = + tdata_col{{dont_care /*null*/, dont_care, 1, null, XXX, 0, null, 0, 1, 2, 1, 1, dont_care}, + nulls_at({0, 3, 6})}; + return structs_col{{child1, child2}, nulls_at({1, 4})}; + }(); + auto const haystack = cudf::slice(haystack_original, {2, 12})[0]; + + auto const needles_original = [] { + auto child1 = + tdata_col{{dont_care, XXX, null, 1, 1, 2, XXX, null, 1, 1, null, dont_care, dont_care}, + nulls_at({2, 7, 10})}; + auto child2 = + tdata_col{{dont_care, XXX, null, 2, 3, 2, XXX, null, null, 1, 0, dont_care, dont_care}, + nulls_at({2, 7, 8})}; + return structs_col{{child1, child2}, nulls_at({1, 6})}; + }(); + auto const needles = cudf::slice(needles_original, {1, 11})[0]; + + auto const expected = bools_col{{null, 1, 0, 0, 1, null, 1, 0, 1, 1}, nulls_at({0, 5})}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +} + +TYPED_TEST(TypedStructContainsTestColumnNeedles, StructOfLists) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const haystack = [] { + // clang-format off + auto child1 = lists_col{{1, 2}, {1}, {}, {1, 3}}; + auto child2 = lists_col{{1, 3, 4}, {2, 3, 4}, {}, {}}; + // clang-format on + return structs_col{{child1, child2}}; + }(); + + auto const needles = [] { + // clang-format off + auto child1 = lists_col{{1, 2}, {1}, {}, {1, 3}, {}}; + auto child2 = lists_col{{1, 3, 4}, {2, 3}, {1, 2}, {}, {}}; + // clang-format on + return structs_col{{child1, child2}}; + }(); + + auto const expected = bools_col{1, 0, 0, 1, 1}; + auto const result = cudf::contains(haystack, needles); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result, verbosity); +}