Skip to content

Commit

Permalink
More UTF-8 codepoints found to be valid
Browse files Browse the repository at this point in the history
The codepoints 0xFF and 0xFE are valid: 0xFF (U+00FF) is Latin small
letter y with diaeresis (https://www.fileformat.info/info/unicode/char/00ff/index.htm) and
0xFE (U+00FE) is Latin small letter thorn
(https://www.fileformat.info/info/unicode/char/fe/index.htm).

Thus these are now allowed and they do indeed decode into the proper
characters.
  • Loading branch information
xexyl committed Oct 13, 2024
1 parent 64abb7b commit a774970
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 11 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ had to be moved to the good subdirectory. It was renamed instead to
`y_string_unicode_U+FDD0_replacement_char.json` to suggest that a replacement is
made.

Also, the codepoints `0xFF` and `0xFE` are now accepted as valid: `0xFF`
(U+00FF) is [Latin small letter y with
diaeresis](https://www.fileformat.info/info/unicode/char/00ff/index.htm) and
`0xFE` (U+00FE) is [Latin small letter
thorn](https://www.fileformat.info/info/unicode/char/fe/index.htm).

Set `JPARSE_UTF8_VERSION` to `"1.2.2 2024-10-13"`.

Fixed a bug in `test_jparse/jparse_test.sh` to show the file that failed in some cases
Expand Down
19 changes: 8 additions & 11 deletions json_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,16 @@ utf8len(const char *str, int32_t surrogate)
x = surrogate;
if (x < 0x80) {
len = 1;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x800) {
len = 2;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x10000) {
len = 3;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x110000) {
len = 4;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else {
warn(__func__, "%X: illegal value\n", x);
len = -1;
Expand Down Expand Up @@ -130,16 +130,16 @@ utf8len(const char *str, int32_t surrogate)

if (x < 0x80) {
len = 1;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x800) {
len = 2;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x10000) {
len = 3;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else if (x < 0x110000) {
len = 4;
dbg(DBG_VVHIGH, "x: %X length %d", x, len);
dbg(DBG_VVHIGH, "%X length %d", x, len);
} else {
warn(__func__, "%X: illegal value\n", x);
len = -1;
Expand Down Expand Up @@ -223,9 +223,6 @@ utf8encode(char *str, unsigned int val)
} else if ((val & 0xFFFF) >= 0xFFFE) {
warn(__func__, "codepoint %X: ends in either FFFE or FFFF", val);
len = UNICODE_NOT_CHARACTER;
} else if (val == 0xFF || val == 0xFE) {
warn(__func__, "codepoint: %X: illegal value", val);
len = UNICODE_NOT_CHARACTER;
} else if (val >= UNI_SUR_HIGH_START && val <= UNI_SUR_LOW_END) {
warn(__func__, "codepoint: %X: illegal surrogate", val);
len = UNICODE_SURROGATE_PAIR;
Expand Down

0 comments on commit a774970

Please sign in to comment.