Skip to content

Commit

Permalink
Perform name matching when parsing name field
Browse files Browse the repository at this point in the history
Signed-off-by: Nghia Truong <nghiat@nvidia.com>
  • Loading branch information
ttnghia committed Oct 4, 2024
1 parent 9e35beb commit 8b0268e
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 69 deletions.
8 changes: 6 additions & 2 deletions src/main/cpp/src/get_json_object.cu
Original file line number Diff line number Diff line change
Expand Up @@ -493,12 +493,16 @@ __device__ thrust::pair<bool, cudf::size_type> evaluate_path(
// find the first child whose name matches the expected name
bool found_expected_child = false;
auto const to_match_name = ctx.path.front().name;
while (json_token::END_OBJECT != p.next_token()) {
while (true) {
auto const is_name_matched = p.parse_next_token_with_matching(to_match_name);
if (json_token::END_OBJECT == p.get_current_token()) { break; }

// JSON validation check
if (json_token::ERROR == p.get_current_token()) { return {false, 0}; }

// current token is FIELD_NAME
if (p.match_current_field_name(to_match_name)) {
// if (p.match_current_field_name(to_match_name)) {
if (is_name_matched) {
// skip FIELD_NAME token
p.next_token();
// JSON validation check
Expand Down
143 changes: 76 additions & 67 deletions src/main/cpp/src/json_parser.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ class json_parser {
{
// TODO eventually chars should be a reader so we can just pass it in...
char_range_reader reader(chars, curr_pos);
auto [success, end_char_pos] = try_parse_string(reader);
[[maybe_unused]] auto const [success, matched, end_char_pos] = try_parse_string(reader);
if (success) {
// TODO remove end_char_pos, and just get it from the reader...
curr_pos = end_char_pos;
Expand Down Expand Up @@ -546,8 +546,10 @@ class json_parser {
} else if ('\\' == c) {
// path 3: escape path
str.next();
char_range_reader to_match(char_range::null());
if (!try_skip_escape_part(str, to_match, copy_destination, w_style, output_size_bytes)) {
char_range_reader to_match(char_range::null()); // unused
bool matched_field_name; // unused
if (!try_skip_escape_part(
str, to_match, copy_destination, w_style, output_size_bytes, matched_field_name)) {
return output_size_bytes;
}
} else {
Expand Down Expand Up @@ -611,18 +613,18 @@ class json_parser {
* @param str string to parse
* @param to_match expected match str
* @param w_style the escape style for writing.
* @return a pair of success and length, where success is true if the string
* is valid and length is the number of bytes needed to encode the string
* in the given style.
* @return a tuple of values indicating if the parse process was successful, field name was
* matched, and byte length needed to encode the string in the given style.
*/
static __device__ inline std::pair<bool, cudf::size_type> try_parse_string(
static __device__ inline thrust::tuple<bool, bool, cudf::size_type> try_parse_string(
char_range_reader& str,
char_range_reader to_match = char_range_reader(char_range::null()),
escape_style w_style = escape_style::UNESCAPED)
{
if (str.eof()) { return std::make_pair(false, 0); }
if (str.eof()) { return thrust::make_tuple(false, false, 0); }
char const quote_char = str.current_char();
int output_size_bytes = 0;
bool matched_field_name{true}; // if to_match is null, this doesn't matter

// write the first " if write style is escaped
if (escape_style::ESCAPED == w_style) { output_size_bytes++; }
Expand All @@ -640,12 +642,12 @@ class json_parser {
str.next();

// match check, the last char in match_str is quote_char
if (!to_match.is_null() && !to_match.eof()) { return std::make_pair(false, 0); }
if (!to_match.is_null() && !to_match.eof()) { matched_field_name = false; }

// write the end " if write style is escaped
if (escape_style::ESCAPED == w_style) { output_size_bytes++; }

return std::make_pair(true, str.pos());
return thrust::make_tuple(true, matched_field_name, str.pos());
} else if (v >= 0 && v < 32) {
// path 2: unescaped control char

Expand All @@ -656,17 +658,18 @@ class json_parser {
}

// check match if enabled
if (!try_match_char(to_match, str.current_char())) { return std::make_pair(false, 0); }
if (!try_match_char(to_match, str.current_char())) { matched_field_name = false; }

str.next();
output_size_bytes++;
continue;
} else if ('\\' == c) {
// path 3: escape path
str.next();
char* copy_dest_nullptr = nullptr;
if (!try_skip_escape_part(str, to_match, copy_dest_nullptr, w_style, output_size_bytes)) {
return std::make_pair(false, 0);
char* copy_dest_nullptr = nullptr; // unused
if (!try_skip_escape_part(
str, to_match, copy_dest_nullptr, w_style, output_size_bytes, matched_field_name)) {
return thrust::make_tuple(false, false, 0);
}
} else {
// path 4: safe code point
Expand All @@ -675,14 +678,14 @@ class json_parser {
// e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ")
if ('\"' == c && escape_style::ESCAPED == w_style) { output_size_bytes++; }

if (!try_skip_safe_code_point(str, c)) { return std::make_pair(false, 0); }
if (!try_skip_safe_code_point(str, c)) { return thrust::make_tuple(false, false, 0); }
// check match if enabled
if (!try_match_char(to_match, c)) { return std::make_pair(false, 0); }
if (!try_match_char(to_match, c)) { matched_field_name = false; }
output_size_bytes++;
}
}

return std::make_pair(false, 0);
return thrust::make_tuple(false, false, 0);
}

static __device__ inline bool try_match_char(char_range_reader& reader, char c)
Expand All @@ -708,7 +711,8 @@ class json_parser {
char_range_reader& to_match,
char*& copy_dest,
escape_style w_style,
int& output_size_bytes)
int& output_size_bytes,
bool& matched_field_name)
{
// already skipped the first '\'
// try skip second part
Expand All @@ -725,17 +729,17 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, c)) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, c);
return true;
case '\'':
// for both unescaped/escaped writes a single char '
if (nullptr != copy_dest) { *copy_dest++ = c; }
if (!try_match_char(to_match, c)) { return false; }

output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, c);
return true;
case '\\':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; }
Expand All @@ -746,16 +750,16 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, c)) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, c);
return true;
case '/':
// for both unescaped/escaped writes a single char /
if (nullptr != copy_dest) { *copy_dest++ = c; }
if (!try_match_char(to_match, c)) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, c);
return true;
case 'b':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\b'; }
Expand All @@ -766,9 +770,9 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, '\b')) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, '\b');
return true;
case 'f':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\f'; }
Expand All @@ -779,9 +783,9 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, '\f')) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, '\f');
return true;
case 'n':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\n'; }
Expand All @@ -792,9 +796,9 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, '\n')) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, '\n');
return true;
case 'r':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\r'; }
Expand All @@ -805,9 +809,9 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, '\r')) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, '\r');
return true;
case 't':
if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\t'; }
Expand All @@ -818,9 +822,9 @@ class json_parser {
}
output_size_bytes++;
}
if (!try_match_char(to_match, '\t')) { return false; }
output_size_bytes++;
str.next();
matched_field_name = matched_field_name && try_match_char(to_match, '\t');
return true;
// path 1 done: \", \', \\, \/, \b, \f, \n, \r, \t
case 'u':
Expand All @@ -829,7 +833,7 @@ class json_parser {

// for both unescaped/escaped writes corresponding utf8 bytes, no need
// to pass in write style
return try_skip_unicode(str, to_match, copy_dest, output_size_bytes);
return try_skip_unicode(str, to_match, copy_dest, output_size_bytes, matched_field_name);
default:
// path 3: invalid
return false;
Expand Down Expand Up @@ -948,7 +952,8 @@ class json_parser {
static __device__ bool try_skip_unicode(char_range_reader& str,
char_range_reader& to_match,
char*& copy_dest,
int& output_size_bytes)
int& output_size_bytes,
bool& matched_field_name)
{
// already parsed \u
// now we expect 4 hex chars.
Expand Down Expand Up @@ -978,7 +983,10 @@ class json_parser {

if (!to_match.is_null()) {
for (cudf::size_type i = 0; i < bytes; i++) {
if (to_match.eof() || to_match.current_char() != buff[i]) { return false; }
if (to_match.eof() || to_match.current_char() != buff[i]) {
matched_field_name = false;
break;
}
to_match.next();
}
}
Expand Down Expand Up @@ -1210,16 +1218,18 @@ class json_parser {
/**
* parse the key string in key:value pair
*/
__device__ inline void parse_field_name_and_set_current()
__device__ inline void parse_field_name_and_set_current(
bool& matched_field_name, char_range to_match_field_name = char_range::null())
{
// TODO eventually chars should be a reader so we can just pass it in...
char_range_reader reader(chars, curr_pos);
current_token_start_pos = curr_pos;
auto [success, end_char_pos] = try_parse_string(reader);
current_token_start_pos = curr_pos;
auto [success, matched, end_char_pos] =
try_parse_string(reader, char_range_reader{to_match_field_name});
if (success) {
// TODO remove end_char_pos, and just get it from the reader...
curr_pos = end_char_pos;
current_token = json_token::FIELD_NAME;
matched_field_name = matched;
curr_pos = end_char_pos;
current_token = json_token::FIELD_NAME;
} else {
set_current_error();
}
Expand All @@ -1231,8 +1241,11 @@ class json_parser {
* @param[out] has_comma_before_token has comma before next token
* @param[out] has_colon_before_token has colon before next token
*/
__device__ inline void parse_next_token_and_set_current(bool& has_comma_before_token,
bool& has_colon_before_token)
__device__ inline void parse_next_token_and_set_current(
bool& has_comma_before_token,
bool& has_colon_before_token,
bool& matched_field_name,
char_range to_match_field_name = char_range::null())
{
skip_whitespaces();
if (!eof()) {
Expand Down Expand Up @@ -1264,7 +1277,7 @@ class json_parser {
current_token = json_token::END_OBJECT;
} else {
// parse key in key:value pair
parse_field_name_and_set_current();
parse_field_name_and_set_current(matched_field_name, to_match_field_name);
}
} else if (current_token == json_token::FIELD_NAME) {
if (c == ':') {
Expand All @@ -1289,7 +1302,7 @@ class json_parser {
// parse next key:value pair
curr_pos++;
skip_whitespaces();
parse_field_name_and_set_current();
parse_field_name_and_set_current(matched_field_name, to_match_field_name);
} else {
set_current_error();
}
Expand Down Expand Up @@ -1351,10 +1364,29 @@ class json_parser {
// parse next token
bool has_comma_before_token; // no-initialization because of do not care here
bool has_colon_before_token; // no-initialization because of do not care here
parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token);
bool matched_field_name; // no-initialization because of do not care here
parse_next_token_and_set_current(
has_comma_before_token, has_colon_before_token, matched_field_name);
return current_token;
}

/**
 * Continue parsing the next token. If the token is a field name then check whether it
 * matches the given name.
 *
 * The parsed token itself is not returned; query it with `get_current_token()` afterwards.
 *
 * @param to_match_field_name the field name to compare the next token against
 * @return true if a field name was parsed and it matched `to_match_field_name`;
 *         false otherwise, including when no field name was parsed
 */
__device__ bool parse_next_token_with_matching(cudf::string_view to_match_field_name)
{
  // parse next token
  bool has_comma_before_token;  // no-initialization because of do not care here
  bool has_colon_before_token;  // no-initialization because of do not care here

  // This flag is returned to the caller, so it must start with a defined value: the
  // parsing code visible here only assigns it after a field name has been parsed
  // successfully. For any other outcome (e.g. END_OBJECT or ERROR) it would otherwise
  // be returned uninitialized.
  bool matched_field_name{false};
  parse_next_token_and_set_current(has_comma_before_token,
                                   has_colon_before_token,
                                   matched_field_name,
                                   char_range{to_match_field_name});
  return matched_field_name;
}

/**
* get current token
*/
Expand Down Expand Up @@ -1573,31 +1605,6 @@ class json_parser {
return 0;
}

/**
 * Compare the current token against the given field name.
 *
 * Wraps `name` in a char_range and forwards to the char_range overload, which
 * returns true only when the current token is FIELD_NAME and the match succeeds.
 */
__device__ bool match_current_field_name(cudf::string_view name) const
{
  char_range const expected_name(name);
  return match_current_field_name(expected_name);
}

/**
 * Match the current field name against `name`.
 *
 * Returns false right away when the current token is not FIELD_NAME. Otherwise the
 * current token range is parsed with `try_parse_string` using `name` as the string
 * to match, and the first value it returns is the result.
 */
__device__ bool match_current_field_name(char_range name) const
{
  // only a FIELD_NAME token can be matched
  if (json_token::FIELD_NAME != current_token) { return false; }

  char_range_reader field_reader(current_range());
  char_range_reader expected(name);
  auto [is_matched, end_pos] =
    try_parse_string(field_reader, expected, escape_style::UNESCAPED);
  return is_matched;
}

/**
* copy current structure to destination.
* return false if meets JSON format error,
Expand Down Expand Up @@ -1648,7 +1655,9 @@ class json_parser {
bool has_colon_before_token = false;

// parse and get has_comma_before_token, has_colon_before_token
parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token);
bool matched_field_name; // unused
parse_next_token_and_set_current(
has_comma_before_token, has_colon_before_token, matched_field_name);

// check the JSON format
if (current_token == json_token::ERROR) { return thrust::make_pair(false, 0); }
Expand Down

0 comments on commit 8b0268e

Please sign in to comment.