Perform name matching when parsing name field

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
NVIDIA · Oct 4, 2024 · 8b0268e · 8b0268e
1 parent 9e35beb
commit 8b0268e
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 69 deletions.
diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu
@@ -493,12 +493,16 @@ __device__ thrust::pair<bool, cudf::size_type> evaluate_path(
  // match first mached children with expected name
  bool found_expected_child = false;
  auto const to_match_name = ctx.path.front().name;
- while (json_token::END_OBJECT != p.next_token()) {
+ while (true) {
+ auto const is_name_matched = p.parse_next_token_with_matching(to_match_name);
+ if (json_token::END_OBJECT == p.get_current_token()) { break; }
+
  // JSON validation check
  if (json_token::ERROR == p.get_current_token()) { return {false, 0}; }
 
  // current token is FIELD_NAME
- if (p.match_current_field_name(to_match_name)) {
+ // if (p.match_current_field_name(to_match_name)) {
+ if (is_name_matched) {
  // skip FIELD_NAME token
  p.next_token();
  // JSON validation check

diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh
@@ -414,7 +414,7 @@ class json_parser {
  {
  // TODO eventually chars should be a reader so we can just pass it in...
  char_range_reader reader(chars, curr_pos);
- auto [success, end_char_pos] = try_parse_string(reader);
+ [[maybe_unused]] auto const [success, matched, end_char_pos] = try_parse_string(reader);
  if (success) {
  // TODO remove end_char_pos, and just get it from the reader...
  curr_pos = end_char_pos;
@@ -546,8 +546,10 @@ class json_parser {
  } else if ('\\' == c) {
  // path 3: escape path
  str.next();
- char_range_reader to_match(char_range::null());
- if (!try_skip_escape_part(str, to_match, copy_destination, w_style, output_size_bytes)) {
+ char_range_reader to_match(char_range::null()); // unused
+ bool matched_field_name; // unused
+ if (!try_skip_escape_part(
+ str, to_match, copy_destination, w_style, output_size_bytes, matched_field_name)) {
  return output_size_bytes;
  }
  } else {
@@ -611,18 +613,18 @@ class json_parser {
  * @param str string to parse
  * @param to_match expected match str
  * @param w_style the escape style for writing.
- * @return a pair of success and length, where success is true if the string
- * is valid and length is the number of bytes needed to encode the string
- * in the given style.
+ * @return a tuple of values indicating if the parse process was successful, field name was
+ * matched, and byte length needed to encode the string in the given style.
  */
- static __device__ inline std::pair<bool, cudf::size_type> try_parse_string(
+ static __device__ inline thrust::tuple<bool, bool, cudf::size_type> try_parse_string(
  char_range_reader& str,
  char_range_reader to_match = char_range_reader(char_range::null()),
  escape_style w_style = escape_style::UNESCAPED)
  {
- if (str.eof()) { return std::make_pair(false, 0); }
+ if (str.eof()) { return thrust::make_tuple(false, false, 0); }
  char const quote_char = str.current_char();
  int output_size_bytes = 0;
+ bool matched_field_name{true}; // if to_match is null, this doesn't matter
 
  // write the first " if write style is escaped
  if (escape_style::ESCAPED == w_style) { output_size_bytes++; }
@@ -640,12 +642,12 @@ class json_parser {
  str.next();
 
  // match check, the last char in match_str is quote_char
- if (!to_match.is_null() && !to_match.eof()) { return std::make_pair(false, 0); }
+ if (!to_match.is_null() && !to_match.eof()) { matched_field_name = false; }
 
  // write the end " if write style is escaped
  if (escape_style::ESCAPED == w_style) { output_size_bytes++; }
 
- return std::make_pair(true, str.pos());
+ return thrust::make_tuple(true, matched_field_name, str.pos());
  } else if (v >= 0 && v < 32) {
  // path 2: unescaped control char
 
@@ -656,17 +658,18 @@ class json_parser {
  }
 
  // check match if enabled
- if (!try_match_char(to_match, str.current_char())) { return std::make_pair(false, 0); }
+ if (!try_match_char(to_match, str.current_char())) { matched_field_name = false; }
 
  str.next();
  output_size_bytes++;
  continue;
  } else if ('\\' == c) {
  // path 3: escape path
  str.next();
- char* copy_dest_nullptr = nullptr;
- if (!try_skip_escape_part(str, to_match, copy_dest_nullptr, w_style, output_size_bytes)) {
- return std::make_pair(false, 0);
+ char* copy_dest_nullptr = nullptr; // unused
+ if (!try_skip_escape_part(
+ str, to_match, copy_dest_nullptr, w_style, output_size_bytes, matched_field_name)) {
+ return thrust::make_tuple(false, false, 0);
  }
  } else {
  // path 4: safe code point
@@ -675,14 +678,14 @@ class json_parser {
  // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ")
  if ('\"' == c && escape_style::ESCAPED == w_style) { output_size_bytes++; }
 
- if (!try_skip_safe_code_point(str, c)) { return std::make_pair(false, 0); }
+ if (!try_skip_safe_code_point(str, c)) { return thrust::make_tuple(false, false, 0); }
  // check match if enabled
- if (!try_match_char(to_match, c)) { return std::make_pair(false, 0); }
+ if (!try_match_char(to_match, c)) { matched_field_name = false; }
  output_size_bytes++;
  }
  }
 
- return std::make_pair(false, 0);
+ return thrust::make_tuple(false, false, 0);
  }
 
  static __device__ inline bool try_match_char(char_range_reader& reader, char c)
@@ -708,7 +711,8 @@ class json_parser {
  char_range_reader& to_match,
  char*& copy_dest,
  escape_style w_style,
- int& output_size_bytes)
+ int& output_size_bytes,
+ bool& matched_field_name)
  {
  // already skipped the first '\'
  // try skip second part
@@ -725,17 +729,17 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, c)) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, c);
  return true;
  case '\'':
  // for both unescaped/escaped writes a single char '
  if (nullptr != copy_dest) { *copy_dest++ = c; }
- if (!try_match_char(to_match, c)) { return false; }
 
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, c);
  return true;
  case '\\':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; }
@@ -746,16 +750,16 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, c)) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, c);
  return true;
  case '/':
  // for both unescaped/escaped writes a single char /
  if (nullptr != copy_dest) { *copy_dest++ = c; }
- if (!try_match_char(to_match, c)) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, c);
  return true;
  case 'b':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\b'; }
@@ -766,9 +770,9 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, '\b')) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, '\b');
  return true;
  case 'f':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\f'; }
@@ -779,9 +783,9 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, '\f')) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, '\f');
  return true;
  case 'n':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\n'; }
@@ -792,9 +796,9 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, '\n')) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, '\n');
  return true;
  case 'r':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\r'; }
@@ -805,9 +809,9 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, '\r')) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, '\r');
  return true;
  case 't':
  if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\t'; }
@@ -818,9 +822,9 @@ class json_parser {
  }
  output_size_bytes++;
  }
- if (!try_match_char(to_match, '\t')) { return false; }
  output_size_bytes++;
  str.next();
+ matched_field_name = matched_field_name && try_match_char(to_match, '\t');
  return true;
  // path 1 done: \", \', \\, \/, \b, \f, \n, \r, \t
  case 'u':
@@ -829,7 +833,7 @@ class json_parser {
 
  // for both unescaped/escaped writes corresponding utf8 bytes, no need
  // to pass in write style
- return try_skip_unicode(str, to_match, copy_dest, output_size_bytes);
+ return try_skip_unicode(str, to_match, copy_dest, output_size_bytes, matched_field_name);
  default:
  // path 3: invalid
  return false;
@@ -948,7 +952,8 @@ class json_parser {
  static __device__ bool try_skip_unicode(char_range_reader& str,
  char_range_reader& to_match,
  char*& copy_dest,
- int& output_size_bytes)
+ int& output_size_bytes,
+ bool& matched_field_name)
  {
  // already parsed \u
  // now we expect 4 hex chars.
@@ -978,7 +983,10 @@ class json_parser {
 
  if (!to_match.is_null()) {
  for (cudf::size_type i = 0; i < bytes; i++) {
- if (to_match.eof() || to_match.current_char() != buff[i]) { return false; }
+ if (to_match.eof() || to_match.current_char() != buff[i]) {
+ matched_field_name = false;
+ break;
+ }
  to_match.next();
  }
  }
@@ -1210,16 +1218,18 @@ class json_parser {
  /**
  * parse the key string in key:value pair
  */
- __device__ inline void parse_field_name_and_set_current()
+ __device__ inline void parse_field_name_and_set_current(
+ bool& matched_field_name, char_range to_match_field_name = char_range::null())
  {
  // TODO eventually chars should be a reader so we can just pass it in...
  char_range_reader reader(chars, curr_pos);
- current_token_start_pos = curr_pos;
- auto [success, end_char_pos] = try_parse_string(reader);
+ current_token_start_pos = curr_pos;
+ auto [success, matched, end_char_pos] =
+ try_parse_string(reader, char_range_reader{to_match_field_name});
  if (success) {
- // TODO remove end_char_pos, and just get it from the reader...
- curr_pos = end_char_pos;
- current_token = json_token::FIELD_NAME;
+ matched_field_name = matched;
+ curr_pos  = end_char_pos;
+ current_token  = json_token::FIELD_NAME;
  } else {
  set_current_error();
  }
@@ -1231,8 +1241,11 @@ class json_parser {
  * @param[out] has_comma_before_token has comma before next token
  * @param[out] has_colon_before_token has colon before next token
  */
- __device__ inline void parse_next_token_and_set_current(bool& has_comma_before_token,
- bool& has_colon_before_token)
+ __device__ inline void parse_next_token_and_set_current(
+ bool& has_comma_before_token,
+ bool& has_colon_before_token,
+ bool& matched_field_name,
+ char_range to_match_field_name = char_range::null())
  {
  skip_whitespaces();
  if (!eof()) {
@@ -1264,7 +1277,7 @@ class json_parser {
  current_token = json_token::END_OBJECT;
  } else {
  // parse key in key:value pair
- parse_field_name_and_set_current();
+ parse_field_name_and_set_current(matched_field_name, to_match_field_name);
  }
  } else if (current_token == json_token::FIELD_NAME) {
  if (c == ':') {
@@ -1289,7 +1302,7 @@ class json_parser {
  // parse next key:value pair
  curr_pos++;
  skip_whitespaces();
- parse_field_name_and_set_current();
+ parse_field_name_and_set_current(matched_field_name, to_match_field_name);
  } else {
  set_current_error();
  }
@@ -1351,10 +1364,29 @@ class json_parser {
  // parse next token
  bool has_comma_before_token; // no-initialization because of do not care here
  bool has_colon_before_token; // no-initialization because of do not care here
- parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token);
+ bool matched_field_name; // no-initialization because of do not care here
+ parse_next_token_and_set_current(
+ has_comma_before_token, has_colon_before_token, matched_field_name);
  return current_token;
  }
 
+ /**
+ * Continute parsing the next token. If the token is a field name then check if it is
+ * matched with the given name.
+ */
+ __device__ bool parse_next_token_with_matching(cudf::string_view to_match_field_name)
+ {
+ // parse next token
+ bool has_comma_before_token; // no-initialization because of do not care here
+ bool has_colon_before_token; // no-initialization because of do not care here
+ bool matched_field_name;
+ parse_next_token_and_set_current(has_comma_before_token,
+ has_colon_before_token,
+ matched_field_name,
+ char_range{to_match_field_name});
+ return matched_field_name;
+ }
+
  /**
  * get current token
  */
@@ -1573,31 +1605,6 @@ class json_parser {
  return 0;
  }
 
- /**
- * match field name string when current token is FIELD_NAME,
- * return true if current token is FIELD_NAME and match successfully.
- * return false otherwise,
- */
- __device__ bool match_current_field_name(cudf::string_view name) const
- {
- return match_current_field_name(char_range(name));
- }
-
- /**
- * match current field name
- */
- __device__ bool match_current_field_name(char_range name) const
- {
- if (json_token::FIELD_NAME == current_token) {
- char_range_reader reader(current_range());
- char_range_reader to_match(name);
- auto [b, end_pos] = try_parse_string(reader, to_match, escape_style::UNESCAPED);
- return b;
- } else {
- return false;
- }
- }
-
  /**
  * copy current structure to destination.
  * return false if meets JSON format error,
@@ -1648,7 +1655,9 @@ class json_parser {
  bool has_colon_before_token = false;
 
  // parse and get has_comma_before_token, has_colon_before_token
- parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token);
+ bool matched_field_name; // unused
+ parse_next_token_and_set_current(
+ has_comma_before_token, has_colon_before_token, matched_field_name);
 
  // check the JSON format
  if (current_token == json_token::ERROR) { return thrust::make_pair(false, 0); }