Skip to content

Commit

Permalink
Add stream parameter to cudf::io::text::multibyte_split (#16034)
Browse files Browse the repository at this point in the history
Adds stream support to the `cudf::io::text::multibyte_split` API.
Also adds a stream test and deprecates an overloaded API.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #16034
  • Loading branch information
davidwendt authored Jul 1, 2024
1 parent 5efd72f commit b691b1c
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 50 deletions.
15 changes: 11 additions & 4 deletions cpp/include/cudf/io/text/byte_range_info.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,17 +24,22 @@
namespace cudf {
namespace io {
namespace text {
/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief stores offset and size used to indicate a byte range
*/
class byte_range_info {
private:
int64_t _offset; ///< offset in bytes
int64_t _size; ///< size in bytes
int64_t _offset{}; ///< offset in bytes
int64_t _size{}; ///< size in bytes

public:
constexpr byte_range_info() noexcept : _offset(0), _size(0) {}
constexpr byte_range_info() = default;
/**
* @brief Constructs a byte_range_info object
*
Expand Down Expand Up @@ -104,6 +109,8 @@ std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_b
*/
byte_range_info create_byte_range_info_max();

/** @} */ // end of group

} // namespace text
} // namespace io
} // namespace cudf
10 changes: 9 additions & 1 deletion cpp/include/cudf/io/text/data_chunk_source.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,12 @@ namespace cudf {
namespace io {
namespace text {

/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief A contract guaranteeing stream-ordered memory access to the underlying device data.
*
Expand Down Expand Up @@ -110,6 +116,8 @@ class data_chunk_source {
[[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
};

/** @} */ // end of group

} // namespace text
} // namespace io
} // namespace cudf
27 changes: 23 additions & 4 deletions cpp/include/cudf/io/text/multibyte_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
namespace cudf {
namespace io {
namespace text {
/**
* @addtogroup io_readers
* @{
* @file
*/

/**
* @brief Parsing options for multibyte_split.
Expand Down Expand Up @@ -79,6 +84,7 @@ struct parse_options {
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param options the parsing options to use (including byte range)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
Expand All @@ -87,17 +93,30 @@ std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
parse_options options = {},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(
/**
* @brief Splits the source text into a strings column using a multiple byte delimiter.
*
* @deprecated Since 24.08. Use the overload that takes a `parse_options` argument and pass the
* byte range through `parse_options` instead.
*
* @param source The source input data encoded in UTF-8
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range The position and size within `source` to produce the column from;
* if no value is provided, the range returned by `create_byte_range_info_max()` is used
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
*/
[[deprecated("Use the multibyte_split overload that takes parse_options")]] std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
std::optional<byte_range_info> byte_range,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
std::string const& delimiter,
rmm::device_async_resource_ref mr);
/** @} */ // end of group

} // namespace text
} // namespace io
Expand Down
19 changes: 8 additions & 11 deletions cpp/src/io/text/multibyte_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -565,35 +565,32 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source

} // namespace detail

// deprecated in 24.08
// Overload taking an optional byte range. It wraps the range in a parse_options
// (substituting create_byte_range_info_max() when byte_range has no value) and
// forwards to the parse_options overload of multibyte_split.
std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
std::string const& delimiter,
std::optional<byte_range_info> byte_range,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// NOTE(review): this commit view lists two return statements back to back. The
// first one does not pass `stream`; the second one forwards it. Presumably the
// first is the pre-change line and only the second remains in the committed
// file -- confirm against the repository.
return multibyte_split(
source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
return multibyte_split(source,
delimiter,
parse_options{byte_range.value_or(create_byte_range_info_max())},
stream,
mr);
}

// Splits `source` on `delimiter` using the given parse options. The options are
// unpacked into their byte_range and strip_delimiters fields and the work is
// delegated to detail::multibyte_split together with `stream` and `mr`.
std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
std::string const& delimiter,
parse_options options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// NOTE(review): this local has the same name as the `stream` parameter above.
// Presumably it is the pre-change line that this commit removes in favor of the
// caller-supplied stream -- confirm against the repository.
auto stream = cudf::get_default_stream();

auto result = detail::multibyte_split(
source, delimiter, options.byte_range, options.strip_delimiters, stream, mr);

return result;
}

// Overload without parse options: forwards to the parse_options overload with a
// default-constructed parse_options.
// NOTE(review): the call below passes `mr` as the fourth argument, where the
// overloads above take `stream` in that position. Presumably this whole overload
// is on the removed side of the commit -- confirm against the repository.
std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
std::string const& delimiter,
rmm::device_async_resource_ref mr)
{
return multibyte_split(source, delimiter, parse_options{}, mr);
}

} // namespace text
} // namespace io
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
Expand Down
81 changes: 52 additions & 29 deletions cpp/tests/io/text/multibyte_split_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
auto expected = strings_column_wrapper{"abcdefg:"};

auto source = cudf::io::text::make_source(host_input);
auto out = cudf::io::text::multibyte_split(
*source,
delimiter,
cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())});
cudf::io::text::parse_options options{
cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())}};
auto out = cudf::io::text::multibyte_split(*source, delimiter, options);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
}
Expand All @@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
auto expected = strings_column_wrapper{"abcdefg:"};

auto source = cudf::io::text::make_source(host_input);
auto out = cudf::io::text::multibyte_split(
*source,
delimiter,
cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
cudf::io::text::parse_options options{
cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)}};
auto out = cudf::io::text::multibyte_split(*source, delimiter, options);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
}
Expand Down Expand Up @@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange)
auto source = cudf::io::text::make_source(host_input);

auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
auto out0 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
auto out1 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
auto out2 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});

auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
auto out = cudf::concatenate(out_views);
Expand All @@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange)
auto source = cudf::io::text::make_source(host_input);

auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
auto out0 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
auto out1 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
auto out2 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});

auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
auto out = cudf::concatenate(out_views);
Expand All @@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte)
auto source = cudf::io::text::make_source(host_input);

auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
auto out0 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
auto out1 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
auto out2 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});

auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
auto out = cudf::concatenate(out_views);
Expand All @@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte)
auto source = cudf::io::text::make_source(host_input);

auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
auto out0 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
auto out1 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
auto out2 = cudf::io::text::multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});

auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
auto out = cudf::concatenate(out_views);
Expand Down Expand Up @@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges)
SCOPED_TRACE(split1);
for (int split2 = split1 + 1; split2 < size; split2++) {
SCOPED_TRACE(split2);
auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
auto out1 = multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
auto out2 =
multibyte_split(*source,
delimiter,
cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
auto out3 = multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});

auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
auto out = cudf::concatenate(out_views);
Expand Down Expand Up @@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
SCOPED_TRACE(split1);
for (int split2 = split1 + 1; split2 < size; split2++) {
SCOPED_TRACE(split2);
auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
auto out1 = multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
auto out2 =
multibyte_split(*source,
delimiter,
cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
auto out3 = multibyte_split(
*source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});

auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
auto out = cudf::concatenate(out_views);
Expand All @@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
auto source = make_source(host_input);
auto expected = strings_column_wrapper{};

auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
auto out =
multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
}
Expand Down Expand Up @@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange)
auto source = make_source(host_input);
auto expected = strings_column_wrapper{};

auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0});
auto out =
multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
}
Expand All @@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte)
auto source = make_source(host_input);
auto expected = strings_column_wrapper{};

auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0});
auto out =
multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
}
Expand Down
36 changes: 36 additions & 0 deletions cpp/tests/streams/io/multibyte_split_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <string>

// Fixture for the multibyte_split stream test; it adds nothing on top of
// cudf::test::BaseFixture.
class MultibyteSplitTest : public cudf::test::BaseFixture {};

// Calls multibyte_split on a small host string with an explicitly supplied test
// stream. The returned column is not inspected; the test only exercises the call.
TEST_F(MultibyteSplitTest, Reader)
{
  std::string const delim(":");
  std::string const input("abc:def");
  auto const chunk_source = cudf::io::text::make_source(input);
  auto const test_stream  = cudf::test::get_default_stream();

  auto const split = cudf::io::text::multibyte_split(
    *chunk_source, delim, cudf::io::text::parse_options{}, test_stream);
}
2 changes: 1 addition & 1 deletion docs/cudf/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def _generate_namespaces(namespaces):
_all_namespaces = _generate_namespaces(
{
# Note that io::datasource is actually a nested class
"cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"},
"cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
"numeric": {},
"nvtext": {},
}
Expand Down

0 comments on commit b691b1c

Please sign in to comment.