diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh index 37616989f00..49437510c7e 100755 --- a/ci/run_cudf_polars_polars_tests.sh +++ b/ci/run_cudf_polars_polars_tests.sh @@ -12,6 +12,7 @@ DESELECTED_TESTS=( "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error "tests/docs/test_user_guide.py" # No dot binary in CI image + "tests/unit/test_polars_import.py::test_fork_safety" # test started to hang in polars-1.14 ) if [[ $(arch) == "aarch64" ]]; then diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e91443ddba8..d21497c4def 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.14 +- polars>=1.11,<1.15 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 2dccb595e59..400c1195e00 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.14 +- polars>=1.11,<1.15 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 7a477291e7a..b6c03dc1bc2 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.14 + - polars >=1.11,<1.15 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5754994f412..7fdaff35525 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -354,9 +354,7 @@ ConfigureNVBench( # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench( - STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/replace.cpp string/url_decode.cu -) +ConfigureBench(STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/url_decode.cu) ConfigureNVBench( STRINGS_NVBENCH @@ -380,6 +378,7 @@ ConfigureNVBench( string/lengths.cpp string/like.cpp string/make_strings_column.cu + string/replace.cpp string/replace_re.cpp string/reverse.cpp string/slice.cpp diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 3d9d51bfd6d..643e857f356 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -14,11 +14,8 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include #include -#include #include @@ -27,59 +24,51 @@ #include #include -#include - -class StringReplace : public cudf::benchmark {}; +#include enum replace_type { scalar, slice, multi }; -static void BM_replace(benchmark::State& state, replace_type rt) +static void bench_replace(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const api = state.get_string("api"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - cudf::string_scalar target("+"); - cudf::string_scalar repl(""); - cudf::test::strings_column_wrapper targets({"+", "-"}); - cudf::test::strings_column_wrapper repls({"", ""}); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case scalar: cudf::strings::replace(input, target, repl); break; - case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; - case multi: - cudf::strings::replace_multiple( - input, cudf::strings_column_view(targets), cudf::strings_column_view(repls)); - break; - } - } + cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const chars_size = input.chars_size(stream); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 2; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (api == "scalar") { + cudf::string_scalar target("+"); + cudf::string_scalar repl("-"); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::replace(input, target, repl); }); + } else if (api == "multi") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::test::strings_column_wrapper targets({"+", " "}); + cudf::test::strings_column_wrapper repls({"-", "_"}); + cudf::strings::replace_multiple( + input, cudf::strings_column_view(targets), cudf::strings_column_view(repls)); + }); + } else if (api == "slice") { + cudf::string_scalar repl("0123456789"); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::replace_slice(input, repl, 1, 10); }); + } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringReplace, name) \ - (::benchmark::State & st) { BM_replace(st, replace_type::name); } \ - BENCHMARK_REGISTER_F(StringReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(scalar) -STRINGS_BENCHMARK_DEFINE(slice) -STRINGS_BENCHMARK_DEFINE(multi) +NVBENCH_BENCH(bench_replace) + .set_name("replace") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_string_axis("api", {"scalar", "multi", "slice"}); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7cd4697f592..0c3244a1c75 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -946,6 +946,8 @@ class json_writer_options_builder; class json_writer_options { // Specify the sink to use for writer output sink_info _sink; + // Specify the compression format of the sink + compression_type _compression = compression_type::NONE; // maximum number of rows to write in each chunk (limits memory use) size_type _rows_per_chunk = std::numeric_limits::max(); // Set of columns to output @@ -1022,6 +1024,13 @@ class json_writer_options { */ [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; } + /** + * @brief Returns compression type used for sink + * + * @return compression type for sink + */ + [[nodiscard]] compression_type get_compression() const { return _compression; } + /** * @brief Whether to output nulls as 'null'. * @@ -1066,6 +1075,13 @@ class json_writer_options { */ void set_table(table_view tbl) { _table = tbl; } + /** + * @brief Sets compression type to be used + * + * @param comptype Compression type for sink + */ + void set_compression(compression_type comptype) { _compression = comptype; } + /** * @brief Sets metadata. * @@ -1153,6 +1169,18 @@ class json_writer_options_builder { return *this; } + /** + * @brief Sets compression type of output sink + * + * @param comptype Compression type used + * @return this for chaining + */ + json_writer_options_builder& compression(compression_type comptype) + { + options._compression = comptype; + return *this; + } + /** * @brief Sets optional metadata (with column names). * diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index c6af0c3c58a..06987139188 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -253,16 +253,11 @@ struct binary_op_double_device_dispatcher { template CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; + auto start = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); #pragma unroll - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += stride) { f(i); } } @@ -282,9 +277,9 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f) int min_grid_size; CUDF_CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel)); - // 2 elements per thread. - int const grid_size = util::div_rounding_up_safe(size, 2 * block_size); - for_each_kernel<<>>(size, std::forward(f)); + auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */); + for_each_kernel<<>>( + size, std::forward(f)); } template diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index 2176dbb2373..b26a6292806 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -26,7 +26,7 @@ #include #include -#include // compress +#include // GZIP compression namespace cudf::io::detail { @@ -77,12 +77,12 @@ std::vector compress_snappy(host_span src, { auto const d_src = cudf::detail::make_device_uvector_async(src, stream, cudf::get_current_device_resource_ref()); - rmm::device_uvector d_dst(src.size(), stream); - cudf::detail::hostdevice_vector> inputs(1, stream); inputs[0] = d_src; inputs.host_to_device_async(stream); + auto dst_size = compress_max_output_chunk_size(nvcomp::compression_type::SNAPPY, src.size()); + rmm::device_uvector d_dst(dst_size, stream); cudf::detail::hostdevice_vector> outputs(1, stream); outputs[0] = d_dst; outputs.host_to_device_async(stream); @@ -93,13 +93,10 @@ std::vector compress_snappy(host_span src, nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, inputs, outputs, hd_status, stream); - stream.synchronize(); hd_status.device_to_host_sync(stream); CUDF_EXPECTS(hd_status[0].status == cudf::io::compression_status::SUCCESS, "snappy compression failed"); - std::vector dst(d_dst.size()); - cudf::detail::cuda_memcpy(host_span{dst}, device_span{d_dst}, stream); - return dst; + return cudf::detail::make_std_vector_sync(d_dst, stream); } } // namespace diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index e1241f8f90c..8156258c810 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO JSON writer implementation */ +#include "io/comp/comp.hpp" #include "io/csv/durations.hpp" #include "io/utilities/parsing_utils.cuh" #include "lists/utilities.hpp" @@ -828,10 +829,10 @@ void write_chunked(data_sink* out_sink, } } -void write_json(data_sink* out_sink, - table_view const& table, - json_writer_options const& options, - rmm::cuda_stream_view stream) +void write_json_uncompressed(data_sink* out_sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); std::vector user_column_names = [&]() { @@ -934,4 +935,24 @@ void write_json(data_sink* out_sink, } } +void write_json(data_sink* out_sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream) +{ + if (options.get_compression() != compression_type::NONE) { + std::vector hbuf; + auto hbuf_sink_ptr = data_sink::create(&hbuf); + write_json_uncompressed(hbuf_sink_ptr.get(), table, options, stream); + stream.synchronize(); + auto comp_hbuf = cudf::io::detail::compress( + options.get_compression(), + host_span(reinterpret_cast(hbuf.data()), hbuf.size()), + stream); + out_sink->host_write(comp_hbuf.data(), comp_hbuf.size()); + return; + } + write_json_uncompressed(out_sink, table, options, stream); +} + } // namespace cudf::io::json::detail diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index 515d28201e8..ce4d2067b82 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include #include @@ -79,14 +81,9 @@ class build_keys_fn { /** * @brief Device output transform functor to construct `size_type` with `cuco::pair` or `cuco::pair` + * rhs_index_type>` */ struct output_fn { - __device__ constexpr cudf::size_type operator()( - cuco::pair const& x) const - { - return static_cast(x.second); - } __device__ constexpr cudf::size_type operator()( cuco::pair const& x) const { @@ -176,15 +173,33 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, auto const iter = cudf::detail::make_counting_transform_iterator( 0, build_keys_fn{d_probe_hasher}); - auto const build_indices_begin = - thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); - auto const probe_indices_begin = - thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{}); - - auto const [probe_indices_end, _] = this->_hash_table.retrieve( - iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()}); + auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); + auto const found_begin = + thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); + + // TODO conditional find for nulls once `cuco::static_set::find_if` is added + // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal + // to `JoinNoneValue`, then `idx` has a match in the hash set. + this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value()); + + auto const tuple_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type>( + [found_iter = found_indices.begin()] __device__(size_type idx) { + return thrust::tuple{*(found_iter + idx), idx}; + })); + auto const output_begin = + thrust::make_zip_iterator(build_indices->begin(), probe_indices->begin()); + auto const output_end = + thrust::copy_if(rmm::exec_policy_nosync(stream), + tuple_iter, + tuple_iter + probe_table_num_rows, + found_indices.begin(), + output_begin, + cuda::proclaim_return_type( + [] __device__(size_type idx) { return idx != JoinNoneValue; })); + auto const actual_size = std::distance(output_begin, output_end); - auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); probe_indices->resize(actual_size, stream); diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 3bd67001c16..7cdce1ff735 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -820,4 +822,24 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } +TEST(BinaryOperationCompiledTest, LargeColumnNoOverflow) +{ + cudf::size_type num_rows{1'799'989'091}; + auto big = cudf::make_column_from_scalar( + cudf::numeric_scalar>{10, true}, num_rows); + auto small = cudf::make_column_from_scalar( + cudf::numeric_scalar>{1, true}, num_rows); + + auto mask = cudf::binary_operation(big->view(), + small->view(), + cudf::binary_operator::GREATER, + cudf::data_type{cudf::type_id::BOOL8}); + + auto agg = cudf::make_sum_aggregation(); + auto result = + cudf::reduce(mask->view(), *agg, cudf::data_type{cudf::type_to_id()}); + auto got = static_cast*>(result.get())->value(); + EXPECT_EQ(num_rows, got); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 39d31c406a5..b96fc6425e4 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -14,10 +14,14 @@ * limitations under the License. */ +#include "io/comp/io_uncomp.hpp" + #include #include +#include #include #include +#include #include #include @@ -31,7 +35,36 @@ struct JsonWriterTest : public cudf::test::BaseFixture {}; -TEST_F(JsonWriterTest, EmptyInput) +/** + * @brief Test fixture for parametrized JSON reader tests + */ +struct JsonCompressedWriterTest : public cudf::test::BaseFixture, + public testing::WithParamInterface {}; + +// Parametrize qualifying JSON tests for multiple compression types +INSTANTIATE_TEST_SUITE_P(JsonCompressedWriterTest, + JsonCompressedWriterTest, + ::testing::Values(cudf::io::compression_type::GZIP, + cudf::io::compression_type::SNAPPY, + cudf::io::compression_type::NONE)); + +void run_test(cudf::io::json_writer_options const& wopts, std::string const& expected) +{ + auto outbuf = wopts.get_sink().buffers().front(); + auto comptype = wopts.get_compression(); + cudf::io::write_json(wopts, cudf::test::get_default_stream()); + if (comptype != cudf::io::compression_type::NONE) { + auto decomp_out_buffer = cudf::io::detail::decompress( + comptype, + cudf::host_span(reinterpret_cast(outbuf->data()), outbuf->size())); + EXPECT_EQ(expected, + std::string_view(reinterpret_cast(decomp_out_buffer.data()), + decomp_out_buffer.size())); + } else + EXPECT_EQ(expected, std::string_view(outbuf->data(), outbuf->size())); +} + +TEST_P(JsonCompressedWriterTest, EmptyInput) { cudf::test::strings_column_wrapper col1; cudf::test::strings_column_wrapper col2; @@ -49,28 +82,21 @@ TEST_F(JsonWriterTest, EmptyInput) .lines(false) .na_rep("null") .build(); - - // Empty columns in table - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - std::string const expected = R"([])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "[]"); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - std::string const expected_lines = "\n"; - EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "\n"); // Empty table - JSON Lines cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, "\n"); } -TEST_F(JsonWriterTest, EmptyLeaf) +TEST_P(JsonCompressedWriterTest, EmptyLeaf) { cudf::test::strings_column_wrapper col1{""}; cudf::test::fixed_width_column_wrapper offsets{0, 0}; @@ -92,19 +118,14 @@ TEST_F(JsonWriterTest, EmptyLeaf) .lines(false) .na_rep("null") .build(); - - // Empty columns in table - cudf::io::write_json(out_options, cudf::test::get_default_stream()); - std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, R"([{"col1":"","col2":[],"col3":[]}])"); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" "\n"; - EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected_lines); } TEST_F(JsonWriterTest, ErrorCases) @@ -141,33 +162,34 @@ TEST_F(JsonWriterTest, ErrorCases) cudf::logic_error); } -TEST_F(JsonWriterTest, PlainTable) +TEST_P(JsonCompressedWriterTest, PlainTable) { + cudf::io::compression_type const comptype = GetParam(); cudf::test::strings_column_wrapper col1{"a", "b", "c"}; cudf::test::strings_column_wrapper col2{"d", "e", "f"}; - cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; - cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; - cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, cudf::test::iterators::nulls_at({0, 2})}; cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; - cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}, {"col4"}, {"col5"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(false) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .compression(comptype) + .na_rep("null") + .build(); std::string const expected = - R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + R"([{"col1":"a","col2":"d","col3":1,"col4":1.5,"col5":null},{"col1":"b","col2":"e","col3":2,"col4":2.5,"col5":2},{"col1":"c","col2":"f","col3":3,"col4":3.5,"col5":null}])"; + run_test(out_options, expected); } -TEST_F(JsonWriterTest, SimpleNested) +TEST_P(JsonCompressedWriterTest, SimpleNested) { std::string const data = R"( {"a": 1, "b": 2, "c": {"d": 3 }, "f": 5.5, "g": [1]} @@ -183,23 +205,23 @@ TEST_F(JsonWriterTest, SimpleNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} {"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[3,4,5]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, MixedNested) +TEST_P(JsonCompressedWriterTest, MixedNested) { std::string const data = R"( {"a": 1, "b": 2, "c": {"d": [3] }, "f": 5.5, "g": [ {"h": 1}]} @@ -215,20 +237,20 @@ TEST_F(JsonWriterTest, MixedNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(false) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" R"({"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[{"h":2},null]},)" R"({"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[{"h":3},{"h":4},{"h":5}]}])"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } TEST_F(JsonWriterTest, WriteReadNested) @@ -375,7 +397,7 @@ TEST_F(JsonWriterTest, WriteReadNested) } } -TEST_F(JsonWriterTest, SpecialChars) +TEST_P(JsonCompressedWriterTest, SpecialChars) { cudf::test::fixed_width_column_wrapper a{1, 6, 1, 6}; cudf::test::strings_column_wrapper b{"abcd", "b\b\f\n\r\t", "\"c\"", "/\\"}; @@ -391,17 +413,15 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} {"\"a\"":6,"'b'":"\/\\"} )"; - auto const output_string = std::string(out_buffer.data(), out_buffer.size()); - EXPECT_EQ(expected, output_string); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, NullList) +TEST_P(JsonCompressedWriterTest, NullList) { std::string const data = R"( {"a": [null], "b": [[1, 2, 3], [null], [null, null, null], [4, null, 5]]} @@ -417,23 +437,23 @@ TEST_F(JsonWriterTest, NullList) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} {"a":[5,null,null],"b":[null,[3,4,5]]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, ChunkedNested) +TEST_P(JsonCompressedWriterTest, ChunkedNested) { std::string const data = R"( {"a": 1, "b": -2, "c": { }, "e": [{"f": 1}]} @@ -455,15 +475,15 @@ TEST_F(JsonWriterTest, ChunkedNested) cudf::io::table_metadata mt{result.metadata}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null") - .rows_per_chunk(8); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .rows_per_chunk(8) + .build(); + std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -475,10 +495,10 @@ TEST_F(JsonWriterTest, ChunkedNested) {"a":8,"b":-2,"c":{"d":64},"e":[{"f":8}]} {"a":9,"b":-2,"c":{"d":81},"e":[{"f":9}]} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, StructAllNullCombinations) +TEST_P(JsonCompressedWriterTest, StructAllNullCombinations) { auto const_1_iter = thrust::make_constant_iterator(1); @@ -512,14 +532,14 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) cudf::io::table_metadata mt{{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(false) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + std::string const expected = R"({} {"e":1} {"d":1} @@ -553,10 +573,10 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) {"a":1,"b":1,"c":1,"d":1} {"a":1,"b":1,"c":1,"d":1,"e":1} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } -TEST_F(JsonWriterTest, Unicode) +TEST_P(JsonCompressedWriterTest, Unicode) { // UTF-8, UTF-16 cudf::test::strings_column_wrapper col1{"\"\\/\b\f\n\r\t", "ராபிட்ஸ்", "$€𐐷𤭢", "C𝞵𝓓𝒻"}; @@ -574,14 +594,13 @@ TEST_F(JsonWriterTest, Unicode) cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}}; std::vector out_buffer; - auto destination = cudf::io::sink_info(&out_buffer); - auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) - .include_nulls(true) - .metadata(mt) - .lines(true) - .na_rep("null"); - - cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} @@ -589,7 +608,7 @@ TEST_F(JsonWriterTest, Unicode) {"col1":"$\u20ac\ud801\udc37\ud852\udf62","col2":"\ud841\ude28\ud846\udd4c\ud849\uddc9\ud84c\uddca\ud850\udea9\ud854\udd7d\ud858\ude71\ud85f\udd31\ud860\udc72\ud864\udc79\ud869\udc22\ud86c\udded\ud872\udf2d\ud877\udeb7\ud878\udea6\u5c6e","int16":null} {"col1":"C\ud835\udfb5\ud835\udcd3\ud835\udcbb","col2":"\ud883\udf91\ud885\udd08\ud888\udf49","int16":4} )"; - EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + run_test(out_options, expected); } CUDF_TEST_PROGRAM_MAIN() diff --git a/dependencies.yaml b/dependencies.yaml index a4a4113d1e4..682aaa612b4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -748,7 +748,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.14 + - polars>=1.11,<1.15 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 080a1af6e19..7a759eea2e9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -47,6 +47,9 @@ def pytest_configure(config: pytest.Config) -> None: EXPECTED_FAILURES: Mapping[str, str] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning", "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index e665d42ab1a..1ce4d7b6867 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.14", + "polars>=1.11,<1.15", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py index a91fbb7aecf..c3ff5534e87 100644 --- a/python/libcudf/libcudf/load.py +++ b/python/libcudf/libcudf/load.py @@ -76,9 +76,15 @@ def load_library(): # Prefer the libraries bundled in this package. If they aren't found # (which might be the case in builds where the library was prebuilt before # packaging the wheel), look for a system installation. - libcudf_lib = _load_wheel_installation(soname) - if libcudf_lib is None: - libcudf_lib = _load_system_installation(soname) + try: + libcudf_lib = _load_wheel_installation(soname) + if libcudf_lib is None: + libcudf_lib = _load_system_installation(soname) + except OSError: + # If none of the searches above succeed, just silently return None + # and rely on other mechanisms (like RPATHs on other DSOs) to + # help the loader find the library. + pass # The caller almost never needs to do anything with this library, but no # harm in offering the option since this object at least provides a handle