
Commit

Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter (#16923)

Addresses #16915

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Basit Ayantunde (https://github.com/lamarrr)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #16923
shrshi authored Sep 27, 2024
1 parent 22d481a commit 6973ef8
Showing 3 changed files with 204 additions and 2 deletions.
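
For context (not part of the commit), here is a minimal usage sketch of the behavior this change enables, modeled on the ViableDelimiterNewlineWS test added below; the function name and the second record are illustrative assumptions, not code from the commit:

#include <cudf/io/json.hpp>
#include <cudf/io/types.hpp>

#include <string>

// Read JSONL that uses '\0' as the record delimiter; the '\n' embedded in the
// first record is now treated as ordinary JSON whitespace, not a record break.
void read_jsonl_with_nul_delimiter()
{
  std::string const buffer = std::string(R"({"a":
100})") + '\0' + R"({"a":200})";

  cudf::io::json_reader_options const opts =
    cudf::io::json_reader_options::builder(
      cudf::io::source_info{buffer.c_str(), buffer.size()})
      .lines(true)
      .delimiter('\0');

  auto const result = cudf::io::read_json(opts);
  // Expected with this change: one INT64 column "a" with two rows (100 and 200).
}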
4 changes: 2 additions & 2 deletions cpp/src/io/json/nested_json_gpu.cu
@@ -618,12 +618,12 @@ struct PdaSymbolToSymbolGroupId {
constexpr auto pda_sgid_lookup_size =
static_cast<int32_t>(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0]));
// We map the delimiter character to LINE_BREAK symbol group id, and the newline character
- // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
+ // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
// escape, comma, colon or whitespace characters.
auto const symbol_position =
symbol == delimiter
? static_cast<int32_t>('\n')
- : (symbol == '\n' ? static_cast<int32_t>(delimiter) : static_cast<int32_t>(symbol));
+ : (symbol == '\n' ? static_cast<int32_t>(' ') : static_cast<int32_t>(symbol));
PdaSymbolGroupIdT symbol_gid =
tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
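For readers skimming the hunk above, a standalone sketch of the remapping (illustrative only; the helper name is hypothetical, not the libcudf internal code): a non-newline delimiter is looked up in the slot of '\n', so it lands in the LINE_BREAK symbol group, while a literal '\n' is now looked up in the slot of ' ' and is classified as WHITE_SPACE; previously it took the delimiter's slot and fell into OTHER.

#include <cstdint>

// Sketch of the symbol-position remapping above (hypothetical helper name).
int32_t remap_symbol_position(char symbol, char delimiter)
{
  if (symbol == delimiter) return static_cast<int32_t>('\n');  // delimiter -> LINE_BREAK slot
  if (symbol == '\n') return static_cast<int32_t>(' ');        // newline -> WHITE_SPACE slot (this change)
  return static_cast<int32_t>(symbol);                         // everything else unchanged
}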
24 changes: 24 additions & 0 deletions cpp/tests/io/json/json_test.cpp
@@ -2575,6 +2575,30 @@ TEST_F(JsonReaderTest, ViableDelimiter)
EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument);
}

TEST_F(JsonReaderTest, ViableDelimiterNewlineWS)
{
// Test input
std::string input = R"({"a":
100})";

cudf::io::json_reader_options json_parser_options =
cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
.lines(true)
.delimiter('\0');

auto result = cudf::io::read_json(json_parser_options);
EXPECT_EQ(result.tbl->num_columns(), 1);
EXPECT_EQ(result.tbl->num_rows(), 1);

EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);

EXPECT_EQ(result.metadata.schema_info[0].name, "a");

auto col1_iterator = thrust::constant_iterator<int64_t>(100);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
int64_wrapper(col1_iterator, col1_iterator + 1));
}

// Test case for dtype prune:
// all paths, only one.
// one present, another not present, nothing present
178 changes: 178 additions & 0 deletions cpp/tests/io/json/nested_json_test.cpp
@@ -29,6 +29,7 @@
#include <cudf/io/datasource.hpp>
#include <cudf/io/json.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -1196,4 +1197,181 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
}
}

TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAsWSAndDelimiter)
{
// Test input. Inline comments used to indicate character indexes
// 012345678 <= line 0
char const delimiter = GetParam();

/* Input: (Note that \n is considered whitespace according to the JSON spec when it is not used as
* a delimiter for JSONL)
* {"a":2}
* {"a":<delimiter>{"a":{"a":[321<delimiter>{"a":[1]}
*
* <delimiter>{"b":123}
* {"b":123}<delimiter>
* {"b"\n:\n\n\n123\n}
*/
std::string input = R"({"a":2})"
"\n";
// starting position 8 (zero indexed)
input += R"({"a":)" + std::string(1, delimiter);
// starting position 14 (zero indexed)
input += R"({"a":{"a":[321)" + std::string(1, delimiter);
// starting position 29 (zero indexed)
input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter);
// starting position 41 (zero indexed)
input += R"({"b":123})"
"\n";
// starting position 51 (zero indexed)
input += R"({"b":123})" + std::string(1, delimiter);
// starting position 61 (zero indexed)
input += R"({"b")" + std::string("\n:\n\n\n123\n}");

// Golden token stream sample
using token_t = cuio_json::token_t;
std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream;
if (delimiter != '\n') {
golden_token_stream = {// Line 0 (valid)
{0, token_t::StructBegin},
{1, token_t::StructMemberBegin},
{1, token_t::FieldNameBegin},
{3, token_t::FieldNameEnd},
{5, token_t::ValueBegin},
{6, token_t::ValueEnd},
{6, token_t::StructMemberEnd},
{6, token_t::StructEnd},
// Line 1 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 2 (valid)
{29, token_t::StructBegin},
{30, token_t::StructMemberBegin},
{30, token_t::FieldNameBegin},
{32, token_t::FieldNameEnd},
{34, token_t::ListBegin},
{35, token_t::ValueBegin},
{36, token_t::ValueEnd},
{36, token_t::ListEnd},
{37, token_t::StructMemberEnd},
{37, token_t::StructEnd},
// Line 3 (valid)
{41, token_t::StructBegin},
{42, token_t::StructMemberBegin},
{42, token_t::FieldNameBegin},
{44, token_t::FieldNameEnd},
{46, token_t::ValueBegin},
{49, token_t::ValueEnd},
{49, token_t::StructMemberEnd},
{49, token_t::StructEnd},
// Line 4 (valid)
{61, token_t::StructBegin},
{62, token_t::StructMemberBegin},
{62, token_t::FieldNameBegin},
{64, token_t::FieldNameEnd},
{70, token_t::ValueBegin},
{73, token_t::ValueEnd},
{74, token_t::StructMemberEnd},
{74, token_t::StructEnd}};
} else {
/* Input:
* {"a":2}
* {"a":
* {"a":{"a":[321
* {"a":[1]}
*
*
* {"b":123}
* {"b":123}
* {"b"\n:\n\n\n123\n}
*/
golden_token_stream = {// Line 0 (valid)
{0, token_t::StructBegin},
{1, token_t::StructMemberBegin},
{1, token_t::FieldNameBegin},
{3, token_t::FieldNameEnd},
{5, token_t::ValueBegin},
{6, token_t::ValueEnd},
{6, token_t::StructMemberEnd},
{6, token_t::StructEnd},
// Line 1 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 2 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
// Line 3 (valid)
{29, token_t::StructBegin},
{30, token_t::StructMemberBegin},
{30, token_t::FieldNameBegin},
{32, token_t::FieldNameEnd},
{34, token_t::ListBegin},
{35, token_t::ValueBegin},
{36, token_t::ValueEnd},
{36, token_t::ListEnd},
{37, token_t::StructMemberEnd},
{37, token_t::StructEnd},
// Line 4 (valid)
{41, token_t::StructBegin},
{42, token_t::StructMemberBegin},
{42, token_t::FieldNameBegin},
{44, token_t::FieldNameEnd},
{46, token_t::ValueBegin},
{49, token_t::ValueEnd},
{49, token_t::StructMemberEnd},
{49, token_t::StructEnd},
// Line 5 (valid)
{51, token_t::StructBegin},
{52, token_t::StructMemberBegin},
{52, token_t::FieldNameBegin},
{54, token_t::FieldNameEnd},
{56, token_t::ValueBegin},
{59, token_t::ValueEnd},
{59, token_t::StructMemberEnd},
{59, token_t::StructEnd},
// Line 6 (invalid)
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd},
{0, token_t::StructBegin},
{0, token_t::StructEnd}};
}

auto const stream = cudf::get_default_stream();

// Prepare input & output buffers
cudf::string_scalar const d_scalar(input, true, stream);
auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
d_scalar.data(), static_cast<size_t>(d_scalar.size())};

// Default parsing options
cudf::io::json_reader_options const in_opts =
cudf::io::json_reader_options::builder(cudf::io::source_info{})
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.delimiter(delimiter)
.lines(true);

// Parse the JSON and get the token stream
auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
d_input, in_opts, stream, cudf::get_current_device_resource_ref());
// Copy back the number of tokens that were written
auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);

stream.synchronize();
// Verify the number of tokens matches
ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());

for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
// Ensure the indices the tokens point to match
EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
// Ensure the token category is correct
EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
}
}

CUDF_TEST_PROGRAM_MAIN()
