Add fix to UTF-8 code points in noncharacter range

The recommendation is that if a code point is in the so-called non-character range, it should be replaced with the replacement character (U+FFFD), which the code now does. Because U+FFFD is 3 bytes long in UTF-8, 3 bytes are added to the allocation length whenever a code point is in the non-character range. Set JPARSE_UTF8_VERSION to "1.2.2 2024-10-13".
xexyl · Oct 13, 2024 · 1a33062 · 1a33062
1 parent 85b3c42
commit 1a33062
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 5 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,15 @@
 # Significant changes in the JSON parser repo
 
+## Release 1.2.4 2024-10-13
+
+Add fix to UTF-8 code: if a codepoint is in the so-called non-character range,
+it is recommended to set it to the replacement character `0xFFFD` which the code
+now does. This character is of (UTF-8) length 3 so in that case an additional 3
+bytes are allocated.
+
+Set `JPARSE_UTF8_VERSION` to `"1.2.2 2024-10-13"`.
+
+
 ## Release 1.2.3 2024-10-12
 
 Add extra sanity check in `sum_and_count()` (see `util.c`).

diff --git a/json_parse.c b/json_parse.c
@@ -1413,6 +1413,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 		}
 		xa = 0;
 		xb = 0;
+
 		/*
 		 * we check for a second \uxxxx first, in case it is a surrogate
 		 * pair
@@ -1435,6 +1436,18 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 		     * no possible surrogate pair found so proceed like there
 		     * was not another \uxxxx
 		     */
+
+		    /*
+		     * first we check if the code point is in the non-character
+		     * range, as the spec recommends setting it to the replacement
+		     * character (UNICODE_REPLACEMENT_CHAR)
+		     */
+		    if (is_unicode_noncharacter(xa)) {
+			/*
+			 * the format has three conversions (%s, %X, %X) so it
+			 * needs three arguments: report xa before overwriting it
+			 */
+			dbg(DBG_MED, "%s: %X is non-character, setting to replacement character: U+%X", __func__,
+				(unsigned int)xa, (unsigned int)UNICODE_REPLACEMENT_CHAR);
+			xa = UNICODE_REPLACEMENT_CHAR;
+		    }
+
 		    bytes = utf8encode(utf8, xa);
 		    if (bytes < 0) {
 			/* error - clear allocated length and free buffer */
@@ -1709,12 +1722,19 @@ json_decode(char const *ptr, size_t len, size_t *retlen)
 		} else if (scanned == 1 || (scanned == 2 && surrogates_to_unicode(xa, xb) < 0)) {
 		    surrogate = xa;
 		    bytes = utf8len(ptr + i, surrogate);
-		    if (bytes <= 0) {
+		    if (bytes <= 0 && bytes != UNICODE_NOT_CHARACTER) {
 			if (retlen != NULL) {
 			    *retlen = 0;
 			}
 			/* utf8len() already warns */
 			return NULL;
+		    } else if (bytes == UNICODE_NOT_CHARACTER) {
+			/*
+			 * according to the spec recommendation, a character in
+			 * the non character range should be set to the
+			 * replacement character which is U+FFFD and is 3 bytes.
+			 */
+			bytes = 3;
 		    }
 		    dbg(DBG_VVHIGH, "UTF-8 bytes: %ju", (uintmax_t)bytes);
 		    mlen += bytes;

diff --git a/json_utf8.c b/json_utf8.c
@@ -22,6 +22,23 @@
 #include <ctype.h>
 #include "json_utf8.h"
 
+/*
+ * is_unicode_noncharacter	- determine if a code point is in the Unicode non-character range
+ *
+ * A code point in the range of >= 0xFDD0 && <= 0xFDEF is a non-character.
+ * The test is made against UNI_NOT_CHAR_MIN and UNI_NOT_CHAR_MAX, which are
+ * defined elsewhere (presumably as those two bounds).
+ *
+ * given:
+ *	x	code point to test
+ *
+ * returns:
+ *	true ==> x is in the non-character range
+ *	false ==> x is NOT in the non-character range
+ */
+bool
+is_unicode_noncharacter(int32_t x)
+{
+    /* below the lower bound: cannot be in the range */
+    if (x < UNI_NOT_CHAR_MIN) {
+	return false;
+    }
+
+    /* at or above the lower bound: in range only if not past the upper bound */
+    return x <= UNI_NOT_CHAR_MAX;
+}
+
 /*
  * count_utf8_bytes	- count bytes needed to encode/decode in str
  *
@@ -53,12 +70,16 @@ utf8len(const char *str, int32_t surrogate)
 	x = surrogate;
 	if (x < 0x80) {
 	    len = 1;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else {
 	    warn(__func__, "%x: illegal value\n", x);
 	    len = -1;
@@ -89,6 +110,10 @@ utf8len(const char *str, int32_t surrogate)
 	}
     }
 
+    /*
+     * we have to perform additional checks
+     */
+
     /*
      * now that we know that there is a \u followed by FOUR HEX digits we can
      * try and extract it as a SINGLE HEX number
@@ -105,12 +130,16 @@ utf8len(const char *str, int32_t surrogate)
 
 	if (x < 0x80) {
 	    len = 1;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
+	    dbg(DBG_MED, "x %X length %d", x, len);
 	} else {
 	    warn(__func__, "%x: illegal value\n", x);
 	    len = -1;
@@ -187,8 +216,9 @@ utf8encode(char *str, unsigned int val)
 	not_reached();
     }
 
-    if (val >= UNI_NOT_CHAR_MIN && val <= UNI_NOT_CHAR_MAX) {
-	warn(__func__, "invalid codepoint: %X", val);
+    if (is_unicode_noncharacter(val)) {
+	warn(__func__, "invalid codepoint: %X, will set to non-character replacement character %d",
+		val, UNICODE_REPLACEMENT_CHAR);
 	len = UNICODE_NOT_CHARACTER;
     } else if ((val & 0xFFFF) >= 0xFFFE) {
 	warn(__func__, "codepoint %X: ends in either FFFE or FFFF", val);

diff --git a/json_utf8.h b/json_utf8.h
@@ -36,10 +36,12 @@
 /*
  * official jparse UTF-8 version
  */
-#define JPARSE_UTF8_VERSION "1.2.1 2024-10-10"	/* format: major.minor YYYY-MM-DD */
+#define JPARSE_UTF8_VERSION "1.2.2 2024-10-13"	/* format: major.minor YYYY-MM-DD */
 
+#define UNICODE_REPLACEMENT_CHAR 0xFFFD
 
 extern size_t utf8len(const char *str, int32_t surrogate);
+extern bool is_unicode_noncharacter(int32_t x);
 
 /*
  * The below function and macros are based on code from

diff --git a/version.h b/version.h
@@ -30,7 +30,7 @@
  *
  * NOTE: this should match the latest Release string in CHANGES.md
  */
-#define JPARSE_REPO_VERSION "1.2.3 2024-10-12"		/* format: major.minor YYYY-MM-DD */
+#define JPARSE_REPO_VERSION "1.2.4 2024-10-13"		/* format: major.minor YYYY-MM-DD */
 
 /*
  * official jparse version