Skip to content

Commit

Permalink
Add fix to UTF-8 code points in noncharacter range
Browse files Browse the repository at this point in the history
The recommendation is that if a code point is in the so-called
non-character range, it should be set to the replacement character
(U+FFFD, i.e. 0xFFFD), which the code now does. Because the UTF-8
encoding of U+FFFD is 3 bytes long, an additional 3 bytes are added
whenever a code point is in the non-character range.

Set JPARSE_UTF8_VERSION to "1.2.2 2024-10-13".
  • Loading branch information
xexyl committed Oct 13, 2024
1 parent 85b3c42 commit 1a33062
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 5 deletions.
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Significant changes in the JSON parser repo

## Release 1.2.4 2024-10-13

Add fix to UTF-8 code: if a code point is in the so-called non-character range,
it is recommended to set it to the replacement character `0xFFFD` (U+FFFD), which
the code now does. The UTF-8 encoding of this character is 3 bytes long, so in
that case an additional 3 bytes are allocated.

Set `JPARSE_UTF8_VERSION` to `"1.2.2 2024-10-13"`.


## Release 1.2.3 2024-10-12

Add extra sanity check in `sum_and_count()` (see `util.c`).
Expand Down
22 changes: 21 additions & 1 deletion json_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -1413,6 +1413,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
}
xa = 0;
xb = 0;

/*
* we check for a second \uxxxx first, in case it is a surrogate
* pair
Expand All @@ -1435,6 +1436,18 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
* no possible surrogate pair found so proceed like there
* was not another \uxxxx
*/

/*
* first we check if the code point is in the non-character range, as
* the spec recommends setting it to the replacement character.
* *sigh*
*/
if (is_unicode_noncharacter(xa)) {
dbg(DBG_MED, "%s: %X is non-character, setting to replacement character: U+%X", __func__,
UNICODE_REPLACEMENT_CHAR);
xa = UNICODE_REPLACEMENT_CHAR;
}

bytes = utf8encode(utf8, xa);
if (bytes < 0) {
/* error - clear allocated length and free buffer */
Expand Down Expand Up @@ -1709,12 +1722,19 @@ json_decode(char const *ptr, size_t len, size_t *retlen)
} else if (scanned == 1 || (scanned == 2 && surrogates_to_unicode(xa, xb) < 0)) {
surrogate = xa;
bytes = utf8len(ptr + i, surrogate);
if (bytes <= 0) {
if (bytes <= 0 && bytes != UNICODE_NOT_CHARACTER) {
if (retlen != NULL) {
*retlen = 0;
}
/* utf8len() already warns */
return NULL;
} else if (bytes == UNICODE_NOT_CHARACTER) {
/*
* according to the spec recommendation, a code point in
* the non-character range should be set to the
* replacement character, which is U+FFFD and is 3 bytes in UTF-8.
*/
bytes = 3;
}
dbg(DBG_VVHIGH, "UTF-8 bytes: %ju", (uintmax_t)bytes);
mlen += bytes;
Expand Down
34 changes: 32 additions & 2 deletions json_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,23 @@
#include <ctype.h>
#include "json_utf8.h"

/*
 * is_unicode_noncharacter - test if a code point is in the non-character range
 *
 * The test is an inclusive range check of x against UNI_NOT_CHAR_MIN and
 * UNI_NOT_CHAR_MAX (documented for this function as 0xFDD0 through 0xFDEF).
 *
 * given:
 *	x	code point to test
 *
 * returns:
 *	true ==> x is in the non-character range
 *	false ==> x is NOT in the non-character range
 *
 * NOTE: only the range bounded by the two macros above is tested here; any
 * other class of invalid code point must be checked by the caller.
 */
bool
is_unicode_noncharacter(int32_t x)
{
    /* below the lower bound: cannot be in the range */
    if (x < UNI_NOT_CHAR_MIN) {
	return false;
    }

    /* above the upper bound: cannot be in the range */
    if (x > UNI_NOT_CHAR_MAX) {
	return false;
    }

    /* UNI_NOT_CHAR_MIN <= x <= UNI_NOT_CHAR_MAX */
    return true;
}

/*
* count_utf8_bytes - count bytes needed to encode/decode in str
*
Expand Down Expand Up @@ -53,12 +70,16 @@ utf8len(const char *str, int32_t surrogate)
x = surrogate;
if (x < 0x80) {
len = 1;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x800) {
len = 2;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x10000) {
len = 3;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x110000) {
len = 4;
dbg(DBG_MED, "x %X length %d", x, len);
} else {
warn(__func__, "%x: illegal value\n", x);
len = -1;
Expand Down Expand Up @@ -89,6 +110,10 @@ utf8len(const char *str, int32_t surrogate)
}
}

/*
* we have to perform additional checks
*/

/*
* now that we know that there is a \u followed by FOUR HEX digits we can
* try and extract it as a SINGLE HEX number
Expand All @@ -105,12 +130,16 @@ utf8len(const char *str, int32_t surrogate)

if (x < 0x80) {
len = 1;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x800) {
len = 2;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x10000) {
len = 3;
dbg(DBG_MED, "x %X length %d", x, len);
} else if (x < 0x110000) {
len = 4;
dbg(DBG_MED, "x %X length %d", x, len);
} else {
warn(__func__, "%x: illegal value\n", x);
len = -1;
Expand Down Expand Up @@ -187,8 +216,9 @@ utf8encode(char *str, unsigned int val)
not_reached();
}

if (val >= UNI_NOT_CHAR_MIN && val <= UNI_NOT_CHAR_MAX) {
warn(__func__, "invalid codepoint: %X", val);
if (is_unicode_noncharacter(val)) {
warn(__func__, "invalid codepoint: %X, will set to non-character replacement character %d",
val, UNICODE_REPLACEMENT_CHAR);
len = UNICODE_NOT_CHARACTER;
} else if ((val & 0xFFFF) >= 0xFFFE) {
warn(__func__, "codepoint %X: ends in either FFFE or FFFF", val);
Expand Down
4 changes: 3 additions & 1 deletion json_utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@
/*
* official jparse UTF-8 version
*/
#define JPARSE_UTF8_VERSION "1.2.1 2024-10-10" /* format: major.minor YYYY-MM-DD */
#define JPARSE_UTF8_VERSION "1.2.2 2024-10-13" /* format: major.minor YYYY-MM-DD */

#define UNICODE_REPLACEMENT_CHAR 0xFFFD

extern size_t utf8len(const char *str, int32_t surrogate);
extern bool is_unicode_noncharacter(int32_t x);

/*
* The below function and macros are based on code from
Expand Down
2 changes: 1 addition & 1 deletion version.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
*
* NOTE: this should match the latest Release string in CHANGES.md
*/
#define JPARSE_REPO_VERSION "1.2.3 2024-10-12" /* format: major.minor YYYY-MM-DD */
#define JPARSE_REPO_VERSION "1.2.4 2024-10-13" /* format: major.minor YYYY-MM-DD */

/*
* official jparse version
Expand Down

0 comments on commit 1a33062

Please sign in to comment.