More UTF-8 codepoints found to be valid

The codepoints 0xFF and 0xFE are valid: 0xFF (U+00FF) is Latin small letter y with diaeresis (https://www.fileformat.info/info/unicode/char/00ff/index.htm) and 0xFE (U+00FE) is Latin small letter thorn (https://www.fileformat.info/info/unicode/char/fe/index.htm). They are therefore no longer rejected by utf8encode(), and they do indeed decode into the proper characters.
xexyl · Oct 13, 2024 · a774970 · a774970
1 parent 64abb7b
commit a774970
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 11 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -12,6 +12,12 @@ had to be moved to the good subdirectory. It was renamed instead to
 `y_string_unicode_U+FDD0_replacement_char.json` to suggest that a replacement is
 made.
 
+Also, the codepoints `0xFF` and `0xFE` are valid where `0xFF` is [Latin small
+letter y with
+diaeresis](https://www.fileformat.info/info/unicode/char/00ff/index.htm) and
+`0xFE` is [Latin small letter
+thorn](https://www.fileformat.info/info/unicode/char/fe/index.htm).
+
 Set `JPARSE_UTF8_VERSION` to `"1.2.2 2024-10-13"`.
 
 Bug fix `test_jparse/jparse_test.sh` to show the file that failed in some cases

diff --git a/json_utf8.c b/json_utf8.c
@@ -70,16 +70,16 @@ utf8len(const char *str, int32_t surrogate)
 	x = surrogate;
 	if (x < 0x80) {
 	    len = 1;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else {
 	    warn(__func__, "%X: illegal value\n", x);
 	    len = -1;
@@ -130,16 +130,16 @@ utf8len(const char *str, int32_t surrogate)
 
 	if (x < 0x80) {
 	    len = 1;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x800) {
 	    len = 2;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x10000) {
 	    len = 3;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else if (x < 0x110000) {
 	    len = 4;
-	    dbg(DBG_VVHIGH, "x: %X length %d", x, len);
+	    dbg(DBG_VVHIGH, "%X length %d", x, len);
 	} else {
 	    warn(__func__, "%X: illegal value\n", x);
 	    len = -1;
@@ -223,9 +223,6 @@ utf8encode(char *str, unsigned int val)
     } else if ((val & 0xFFFF) >= 0xFFFE) {
 	warn(__func__, "codepoint %X: ends in either FFFE or FFFF", val);
 	len = UNICODE_NOT_CHARACTER;
-    } else if (val == 0xFF || val == 0xFE) {
-	warn(__func__, "codepoint: %X: illegal value", val);
-	len = UNICODE_NOT_CHARACTER;
     } else if (val >= UNI_SUR_HIGH_START && val <= UNI_SUR_LOW_END) {
 	warn(__func__, "codepoint: %X: illegal surrogate", val);
 	len = UNICODE_SURROGATE_PAIR;