Fix a parser bug where tokens are misidentified as commas. (#1502)

* Fix a parser bug where tokens are misidentified as commas. In the old and new readers, when parsing an object, a comment followed by any non-`}` token is treated as a comma. The new unit test required changing the runjsontests.py flag regime so that failure tests could be run with default settings. * Honor allowComments==false mode. Much of the comment handling in the parsers is bespoke, and does not honor this flag. By unfiying it under a common API, the parser is simplified and strict mode is now more correctly strict. Note that allowComments mode does not allow for comments in arbitrary locations; they are allowed only in certain positions. Rectifying this is a bigger effort, since collectComments mode requires storing the comments somewhere, and it's not immediately clear where in the DOM all such comments should live. --------- Co-authored-by: Jordan Bayles <bayles.jordan@gmail.com>
open-source-parsers · Sep 10, 2024 · 0a9b9d9 · 0a9b9d9
1 parent c3a9866
commit 0a9b9d9
Show file tree

Hide file tree

Showing 8 changed files with 49 additions and 55 deletions.
diff --git a/include/json/reader.h b/include/json/reader.h
@@ -190,6 +190,7 @@ class JSON_API Reader {
  using Errors = std::deque<ErrorInfo>;
 
  bool readToken(Token& token);
+ bool readTokenSkippingComments(Token& token);
  void skipSpaces();
  bool match(const Char* pattern, int patternLength);
  bool readComment();
@@ -221,7 +222,6 @@ class JSON_API Reader {
  int& column) const;
  String getLocationLineAndColumn(Location location) const;
  void addComment(Location begin, Location end, CommentPlacement placement);
- void skipCommentTokens(Token& token);
 
  static bool containsNewLine(Location begin, Location end);
  static String normalizeEOL(Location begin, Location end);

diff --git a/src/jsontestrunner/main.cpp b/src/jsontestrunner/main.cpp
@@ -240,11 +240,14 @@ static int parseCommandLine(int argc, const char* argv[], Options* opts) {
  return printUsage(argv);
  }
  int index = 1;
- if (Json::String(argv[index]) == "--json-checker") {
- opts->features = Json::Features::strictMode();
+ if (Json::String(argv[index]) == "--parse-only") {
  opts->parseOnly = true;
  ++index;
  }
+ if (Json::String(argv[index]) == "--strict") {
+ opts->features = Json::Features::strictMode();
+ ++index;
+ }
  if (Json::String(argv[index]) == "--json-config") {
  printConfig();
  return 3;

diff --git a/src/lib_json/json_reader.cpp b/src/lib_json/json_reader.cpp
@@ -129,7 +129,7 @@ bool Reader::parse(const char* beginDoc, const char* endDoc, Value& root,
 
  bool successful = readValue();
  Token token;
- skipCommentTokens(token);
+ readTokenSkippingComments(token);
  if (collectComments_ && !commentsBefore_.empty())
  root.setComment(commentsBefore_, commentAfter);
  if (features_.strictRoot_) {
@@ -157,7 +157,7 @@ bool Reader::readValue() {
  throwRuntimeError("Exceeded stackLimit in readValue().");
 
  Token token;
- skipCommentTokens(token);
+ readTokenSkippingComments(token);
  bool successful = true;
 
  if (collectComments_ && !commentsBefore_.empty()) {
@@ -225,14 +225,14 @@ bool Reader::readValue() {
  return successful;
 }
 
-void Reader::skipCommentTokens(Token& token) {
+bool Reader::readTokenSkippingComments(Token& token) {
+ bool success = readToken(token);
  if (features_.allowComments_) {
- do {
- readToken(token);
- } while (token.type_ == tokenComment);
- } else {
- readToken(token);
+ while (success && token.type_ == tokenComment) {
+ success = readToken(token);
+ }
  }
+ return success;
 }
 
 bool Reader::readToken(Token& token) {
@@ -446,12 +446,7 @@ bool Reader::readObject(Token& token) {
  Value init(objectValue);
  currentValue().swapPayload(init);
  currentValue().setOffsetStart(token.start_ - begin_);
- while (readToken(tokenName)) {
- bool initialTokenOk = true;
- while (tokenName.type_ == tokenComment && initialTokenOk)
- initialTokenOk = readToken(tokenName);
- if (!initialTokenOk)
- break;
+ while (readTokenSkippingComments(tokenName)) {
  if (tokenName.type_ == tokenObjectEnd && name.empty()) // empty object
  return true;
  name.clear();
@@ -480,15 +475,11 @@ bool Reader::readObject(Token& token) {
  return recoverFromError(tokenObjectEnd);
 
  Token comma;
- if (!readToken(comma) ||
- (comma.type_ != tokenObjectEnd && comma.type_ != tokenArraySeparator &&
- comma.type_ != tokenComment)) {
+ if (!readTokenSkippingComments(comma) ||
+ (comma.type_ != tokenObjectEnd && comma.type_ != tokenArraySeparator)) {
  return addErrorAndRecover("Missing ',' or '}' in object declaration",
  comma, tokenObjectEnd);
  }
- bool finalizeTokenOk = true;
- while (comma.type_ == tokenComment && finalizeTokenOk)
- finalizeTokenOk = readToken(comma);
  if (comma.type_ == tokenObjectEnd)
  return true;
  }
@@ -518,10 +509,7 @@ bool Reader::readArray(Token& token) {
 
  Token currentToken;
  // Accept Comment after last item in the array.
- ok = readToken(currentToken);
- while (currentToken.type_ == tokenComment && ok) {
- ok = readToken(currentToken);
- }
+ ok = readTokenSkippingComments(currentToken);
  bool badTokenType = (currentToken.type_ != tokenArraySeparator &&
  currentToken.type_ != tokenArrayEnd);
  if (!ok || badTokenType) {
@@ -943,6 +931,7 @@ class OurReader {
  using Errors = std::deque<ErrorInfo>;
 
  bool readToken(Token& token);
+ bool readTokenSkippingComments(Token& token);
  void skipSpaces();
  void skipBom(bool skipBom);
  bool match(const Char* pattern, int patternLength);
@@ -976,7 +965,6 @@ class OurReader {
  int& column) const;
  String getLocationLineAndColumn(Location location) const;
  void addComment(Location begin, Location end, CommentPlacement placement);
- void skipCommentTokens(Token& token);
 
  static String normalizeEOL(Location begin, Location end);
  static bool containsNewLine(Location begin, Location end);
@@ -1030,7 +1018,7 @@ bool OurReader::parse(const char* beginDoc, const char* endDoc, Value& root,
  bool successful = readValue();
  nodes_.pop();
  Token token;
- skipCommentTokens(token);
+ readTokenSkippingComments(token);
  if (features_.failIfExtra_ && (token.type_ != tokenEndOfStream)) {
  addError("Extra non-whitespace after JSON value.", token);
  return false;
@@ -1058,7 +1046,7 @@ bool OurReader::readValue() {
  if (nodes_.size() > features_.stackLimit_)
  throwRuntimeError("Exceeded stackLimit in readValue().");
  Token token;
- skipCommentTokens(token);
+ readTokenSkippingComments(token);
  bool successful = true;
 
  if (collectComments_ && !commentsBefore_.empty()) {
@@ -1145,14 +1133,14 @@ bool OurReader::readValue() {
  return successful;
 }
 
-void OurReader::skipCommentTokens(Token& token) {
+bool OurReader::readTokenSkippingComments(Token& token) {
+ bool success = readToken(token);
  if (features_.allowComments_) {
- do {
- readToken(token);
- } while (token.type_ == tokenComment);
- } else {
- readToken(token);
+ while (success && token.type_ == tokenComment) {
+ success = readToken(token);
+ }
  }
+ return success;
 }
 
 bool OurReader::readToken(Token& token) {
@@ -1449,12 +1437,7 @@ bool OurReader::readObject(Token& token) {
  Value init(objectValue);
  currentValue().swapPayload(init);
  currentValue().setOffsetStart(token.start_ - begin_);
- while (readToken(tokenName)) {
- bool initialTokenOk = true;
- while (tokenName.type_ == tokenComment && initialTokenOk)
- initialTokenOk = readToken(tokenName);
- if (!initialTokenOk)
- break;
+ while (readTokenSkippingComments(tokenName)) {
  if (tokenName.type_ == tokenObjectEnd &&
  (name.empty() ||
  features_.allowTrailingCommas_)) // empty object or trailing comma
@@ -1491,15 +1474,11 @@ bool OurReader::readObject(Token& token) {
  return recoverFromError(tokenObjectEnd);
 
  Token comma;
- if (!readToken(comma) ||
- (comma.type_ != tokenObjectEnd && comma.type_ != tokenArraySeparator &&
- comma.type_ != tokenComment)) {
+ if (!readTokenSkippingComments(comma) ||
+ (comma.type_ != tokenObjectEnd && comma.type_ != tokenArraySeparator)) {
  return addErrorAndRecover("Missing ',' or '}' in object declaration",
  comma, tokenObjectEnd);
  }
- bool finalizeTokenOk = true;
- while (comma.type_ == tokenComment && finalizeTokenOk)
- finalizeTokenOk = readToken(comma);
  if (comma.type_ == tokenObjectEnd)
  return true;
  }
@@ -1533,10 +1512,7 @@ bool OurReader::readArray(Token& token) {
 
  Token currentToken;
  // Accept Comment after last item in the array.
- ok = readToken(currentToken);
- while (currentToken.type_ == tokenComment && ok) {
- ok = readToken(currentToken);
- }
+ ok = readTokenSkippingComments(currentToken);
  bool badTokenType = (currentToken.type_ != tokenArraySeparator &&
  currentToken.type_ != tokenArrayEnd);
  if (!ok || badTokenType) {

diff --git a/test/data/fail_strict_comment_01.json b/test/data/fail_strict_comment_01.json
@@ -0,0 +1,4 @@
+{
+ "a": "aaa",
+ "b": "bbb" // comments not allowed in strict mode
+}
diff --git a/test/data/fail_strict_comment_02.json b/test/data/fail_strict_comment_02.json
@@ -0,0 +1,4 @@
+{
+ "a": "aaa", // comments not allowed in strict mode
+ "b": "bbb"
+}
diff --git a/test/data/fail_strict_comment_03.json b/test/data/fail_strict_comment_03.json
@@ -0,0 +1,3 @@
+{
+ "array" : [1, 2, 3 /* comments not allowed in strict mode */]
+}
diff --git a/test/data/fail_test_object_02.json b/test/data/fail_test_object_02.json
@@ -0,0 +1 @@
+{"one": 1 /* } */ { "two" : 2 }
diff --git a/test/runjsontests.py b/test/runjsontests.py
@@ -97,14 +97,17 @@ def runAllTests(jsontest_executable_path, input_dir = None,
  valgrind_path = use_valgrind and VALGRIND_CMD or ''
  for input_path in tests + test_jsonchecker:
  expect_failure = os.path.basename(input_path).startswith('fail')
- is_json_checker_test = (input_path in test_jsonchecker) or expect_failure
+ is_json_checker_test = input_path in test_jsonchecker
+ is_parse_only = is_json_checker_test or expect_failure
+ is_strict_test = ('_strict_' in os.path.basename(input_path)) or is_json_checker_test
  print('TESTING:', input_path, end=' ')
- options = is_json_checker_test and '--json-checker' or ''
+ options = is_parse_only and '--parse-only' or ''
+ options += is_strict_test and ' --strict' or ''
  options += ' --json-writer %s'%writerClass
  cmd = '%s%s %s "%s"' % ( valgrind_path, jsontest_executable_path, options,
  input_path)
  status, process_output = getStatusOutput(cmd)
- if is_json_checker_test:
+ if is_parse_only:
  if expect_failure:
  if not status:
  print('FAILED')