Move bad test JSON file to good JSON directory

Due to commit 1a33062, which implements the recommendation that applications set Unicode non-characters to the replacement character (U+FFFD), the file n_string_unicode_U+FDD0_nonchar.json had to be moved to the good directory. It was renamed to y_string_unicode_U+FDD0_replacement_char.json to indicate that a replacement is made. This is quite unfortunate, as it makes our test suite diverge even more from the test suite repo we found, but according to https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf this should be done.
xexyl · Oct 13, 2024 · 64abb7b · 64abb7b
1 parent 379debd
commit 64abb7b
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 18 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,15 +3,21 @@
 ## Release 1.2.4 2024-10-13
 
 Add fix to UTF-8 code: if a codepoint is in the so-called non-character range,
-it is recommended to set it to the replacement character `0xFFFD` which the code
-now does. This character is of (UTF-8) length 3 so in that case an additional 3
-bytes are allocated.
+it is recommended to set it to the replacement character `0xFFFD` (see
+<https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf>) which
+the code now does. This character is of (UTF-8) length 3 so in that case an
+additional 3 bytes are allocated. This fix unfortunately makes us diverge even
+more from the JSON test suite as a file, `n_string_unicode_U+FDD0_nonchar.json`
+had to be moved to the good subdirectory. It was renamed instead to
+`y_string_unicode_U+FDD0_replacement_char.json` to suggest that a replacement is
+made.
 
 Set `JPARSE_UTF8_VERSION` to `"1.2.2 2024-10-13"`.
 
 Bug fix `test_jparse/jparse_test.sh` to show the file that failed in some cases
 (an `$` was left off by accident).
 
+
 ## Release 1.2.3 2024-10-12
 
 Add extra sanity check in `sum_and_count()` (see `util.c`).

diff --git a/json_parse.c b/json_parse.c
@@ -1439,8 +1439,10 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 
 		    /*
 		     * first we check if the range is in the non-character range as
-		     * the spec recommend setting it to the replacement character.
-		     * *sigh*
+		     * the spec recommendation is to set non-characters to the
+		     * replacement character (see
+		     * https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf
+		     * for more details).
 		     */
 		    if (is_unicode_noncharacter(xa)) {
 			dbg(DBG_MED, "%s: %X is non-character, setting to replacement character: U+%X", __func__,
@@ -1730,9 +1732,11 @@ json_decode(char const *ptr, size_t len, size_t *retlen)
 			return NULL;
 		    } else if (bytes == UNICODE_NOT_CHARACTER) {
 			/*
-			 * according to the spec recommendation, a character in
-			 * the non character range should be set to the
-			 * replacement character which is U+FFFD and is 3 bytes.
+			 * according to the spec recommendation (see
+			 * https://www.unicode.org/versions/Unicode13.0.0/UnicodeStandard-13.0.pdf),
+			 * a character in the non character range should be set
+			 * to the replacement character which is U+FFFD, and
+			 * which is in UTF-8 terms 3 bytes.
 			 */
 			bytes = 3;
 		    }

diff --git a/json_utf8.c b/json_utf8.c
@@ -70,18 +70,18 @@ utf8len(const char *str, int32_t surrogate)
 	x = surrogate;
 	if (x < 0x80) {
 	    len = 1;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else {
-	    warn(__func__, "%x: illegal value\n", x);
+	    warn(__func__, "%X: illegal value\n", x);
 	    len = -1;
 	}
 
@@ -130,18 +130,18 @@ utf8len(const char *str, int32_t surrogate)
 
 	if (x < 0x80) {
 	    len = 1;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
-	    dbg(DBG_MED, "x %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
 	} else {
-	    warn(__func__, "%x: illegal value\n", x);
+	    warn(__func__, "%X: illegal value\n", x);
 	    len = -1;
 	}
     }

diff --git a/.../bad/n_string_unicode_U+FDD0_nonchar.json → ...ring_unicode_U+FDD0_replacement_char.json b/.../bad/n_string_unicode_U+FDD0_nonchar.json → ...ring_unicode_U+FDD0_replacement_char.json