Renamed utf8encode() to utf8_to_unicode()

Renamed utf8encode() to utf8_to_unicode() to be less confusing: although converting a code point to unicode is called encoding (by all sources we have seen), in JSON, according to encoders **AND** decoders out there, a code point in a string should be converted to unicode. A number of bugs have been identified in jstrencode(1) during discussion in 'the other repo' (or one of the 'other repos'). This is in jstrencode(1) now (as of yesterday); prior to yesterday it was in jstrdecode(1) due to the unfortunate swap in names. This swap happened because, when focusing on issue #13 (the decoding - which turned out to be encoding - bug of \uxxxx), we lost sight of the fact that the jstrencode(1) tool is not strictly UTF-8 but rather JSON. The man page has had these bugs added so it is important to remove them when the bugs are fixed. A new issue #28 has been opened for these problems.
xexyl · Nov 14, 2024 · c64884e · c64884e
1 parent 01d02a6
commit c64884e
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 31 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,22 @@
 # Significant changes in the JSON parser repo
 
+## Release 2.0.8 2024-11-14
+
+Renamed `utf8encode()` to `utf8_to_unicode()` to be less confusing: although
+converting a code point to unicode is called encoding (by all sources we have
+seen), in JSON, according to encoders **AND** decoders out there, a code point
+in a string should be converted to unicode.
+
+A number of bugs have been identified in `jstrencode(1)` during discussion in
+'the other repo' (or one of the 'other repos'). This is in `jstrencode(1)` now
+(as of yesterday); prior to yesterday it was in `jstrdecode(1)` due to the
+unfortunate swap in names. This swap happened because, when focusing on issue
+#13 (the decoding - which turned out to be encoding - bug of `\uxxxx`), we lost
+sight of the fact that the `jstrencode(1)` tool is not strictly UTF-8 but rather
+JSON. The man page has had these bugs added so it is important to remove them
+when the bugs are fixed. A new issue #28 has been opened for these problems.
+
+
 ## Release 2.0.7 2024-11-13
 
 Swap encode/decode terminology again. This is because it refers to **JSON**

diff --git a/json_parse.c b/json_parse.c
@@ -110,7 +110,7 @@ struct byte2asciistr byte2asciistr[JSON_BYTE_VALUES] = {
     {'P', 1, "P", 1}, {'Q', 1, "Q", 1},    {'R', 1, "R", 1}, {'S', 1, "S", 1},
     {'T', 1, "T", 1}, {'U', 1, "U", 1},    {'V', 1, "V", 1}, {'W', 1, "W", 1},
     {'X', 1, "X", 1}, {'Y', 1, "Y", 1},    {'Z', 1, "Z", 1}, {'[', 1, "[", 1},
-    {'\\', 2, "\\\\", 1}, {']', 1, "]",1}, {'^', 1, "^", 1}, {'_', 1, "_", 1},
+    {'\\', 1, "\\\\", 1}, {']', 1, "]",1}, {'^', 1, "^", 1}, {'_', 1, "_", 1},
 
     /* \x60 - \x6f */
     {'`', 1, "`", 1}, {'a', 1, "a", 1}, {'b', 1, "b", 1}, {'c', 1, "c", 1},
@@ -209,7 +209,8 @@ struct byte2asciistr byte2asciistr[JSON_BYTE_VALUES] = {
 char *
 json_encode(char const *ptr, size_t len, size_t *retlen, bool skip_quote)
 {
-    char *ret = NULL;	    /* allocated encoding string or NULL */
+    char *ret = NULL;       /* return value */
+    char *str = NULL;	    /* allocated encoding string or NULL */
     char *beyond = NULL;    /* beyond the end of the allocated encoding string */
     ssize_t mlen = 0;	    /* length of allocated encoded string */
     char *p;		    /* next place to encode */
@@ -245,18 +246,18 @@ json_encode(char const *ptr, size_t len, size_t *retlen, bool skip_quote)
     /*
      * malloc the encoded string
      */
-    ret = malloc((size_t)mlen + 1 + 1);
-    if (ret == NULL) {
+    str = malloc((size_t)mlen + 1 + 1);
+    if (str == NULL) {
 	/* error - clear allocated length */
 	if (retlen != NULL) {
 	    *retlen = 0;
 	}
 	warn(__func__, "malloc of %ju bytes failed", (uintmax_t)(mlen + 1 + 1));
 	return NULL;
     }
-    ret[mlen] = '\0';   /* terminate string */
-    ret[mlen + 1] = '\0';   /* paranoia */
-    beyond = &(ret[mlen]);
+    str[mlen] = '\0';   /* terminate string */
+    str[mlen + 1] = '\0';   /* paranoia */
+    beyond = &(str[mlen]);
 
     /*
      * skip any enclosing quotes if requested
@@ -275,15 +276,15 @@ json_encode(char const *ptr, size_t len, size_t *retlen, bool skip_quote)
     /*
      * JSON encode each byte
      */
-    for (p=ret; i < len; ++i) {
+    for (p=str; i < len; ++i) {
 	if (p+byte2asciistr[(uint8_t)(ptr[i])].len > beyond) {
 	    /* error - clear allocated length */
 	    if (retlen != NULL) {
 		*retlen = 0;
 	    }
-	    if (ret != NULL) {
-		free(ret);
-		ret = NULL;
+	    if (str != NULL) {
+		free(str);
+		str = NULL;
 	    }
 	    warn(__func__, "encoding ran beyond end of allocated encoded string");
 	    return NULL;
@@ -292,24 +293,49 @@ json_encode(char const *ptr, size_t len, size_t *retlen, bool skip_quote)
 	p += byte2asciistr[(uint8_t)(ptr[i])].len;
     }
     *p = '\0';	/* paranoia */
-    mlen = p - ret; /* paranoia */
+    mlen = p - str; /* paranoia */
     if (mlen < 0) { /* paranoia */
 	warn(__func__, "mlen #1: %ju < 0", (uintmax_t)mlen);
-	if (ret != NULL) {
-	    free(ret);
-	    ret = NULL;
+	if (str != NULL) {
+	    free(str);
+	    str = NULL;
 	}
 	return NULL;
     }
 
     /*
-     * return result
+     * we now have to decode the \uxxxx code points
      */
-    dbg(DBG_VVVHIGH, "returning from json_encode(ptr, %ju, *%ju, %s)",
-		     (uintmax_t)len, (uintmax_t)mlen, booltostr(skip_quote));
-    if (retlen != NULL) {
-	*retlen = (size_t)mlen;
+    ret = json_decode(str, mlen, retlen);
+    if (ret == NULL) {
+        if (retlen != NULL) { /* should never be NULL but we check anyway */
+            *retlen = 0;
+        }
+        if (str != NULL) {
+            /*
+             * str should never be NULL but we check as an extra sanity check
+             */
+            free(str);
+            str = NULL;
+        }
+        warn(__func__, "post encoding failed");
+        return NULL;
+    }
+
+    if (str != NULL) {
+        /*
+         * str should always be non NULL at this point but we check as an extra
+         * sanity check
+         */
+        free(str);
+        str = NULL;
     }
+
+    /*
+     * return result
+     */
+    dbg(DBG_VVVHIGH, "returning from json_encode(ptr, %ju, *%ju, %s): %s",
+		     (uintmax_t)len, (uintmax_t)mlen, booltostr(skip_quote), ret);
     return ret;
 }
 
@@ -1502,7 +1528,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 		     * was not another \uxxxx
 		     */
 
-		    bytes = utf8encode(utf8, xa);
+		    bytes = utf8_to_unicode(utf8, xa);
 		    if (bytes <= 0) {
 			/* error - clear allocated length and free buffer */
 			if (retlen != NULL) {
@@ -1512,7 +1538,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 			    free(ret);
 			    ret = NULL;
 			}
-			/* utf8encode warns on error */
+			/* utf8_to_unicode warns on error */
 			return NULL;
 		    }
 		    /*
@@ -1548,7 +1574,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 			return NULL;
 		    }
 
-		    bytes = utf8encode(utf8, surrogate);
+		    bytes = utf8_to_unicode(utf8, surrogate);
 		    if (bytes <= 0) {
 			/* error - clear allocated length and free buffer */
 			if (retlen != NULL) {
@@ -1558,7 +1584,7 @@ decode_json_string(char const *ptr, size_t len, size_t mlen, size_t *retlen)
 			    free(ret);
 			    ret = NULL;
 			}
-			/* utf8encode() warns on error */
+			/* utf8_to_unicode() warns on error */
 			return NULL;
 		    }
 

diff --git a/json_utf8.c b/json_utf8.c
@@ -138,6 +138,13 @@ utf8len(const char *str, int32_t surrogate)
  */
 
 /*
+ * NOTE: the following comment describes the function formerly named
+ * utf8encode(). It was renamed because in JSON, BOTH encoding and decoding
+ * should convert \uxxxx to unicode. All sources we have seen call this encoding
+ * but, to be less confusing in a JSON library, we call it utf8_to_unicode().
+ *
+ * --
+ *
  * UTF8 valid ranges.
  *
  * The UTF-8 decoding spreads the bits of a 32bit word over several
@@ -186,7 +193,7 @@ utf8len(const char *str, int32_t surrogate)
  *
  */
 int
-utf8encode(char *str, unsigned int val)
+utf8_to_unicode(char *str, unsigned int val)
 {
     int len = -1;
 

diff --git a/json_utf8.h b/json_utf8.h
@@ -36,7 +36,7 @@
 /*
  * official jparse UTF-8 version
  */
-#define JPARSE_UTF8_VERSION "2.0.4 2024-11-13"	/* format: major.minor YYYY-MM-DD */
+#define JPARSE_UTF8_VERSION "2.0.5 2024-11-14"	/* format: major.minor YYYY-MM-DD */
 
 
 extern size_t utf8len(const char *str, int32_t surrogate);
@@ -58,7 +58,13 @@ extern size_t utf8len(const char *str, int32_t surrogate);
 #define UTF8_V_MASK     0x3F
 #define UTF8_V_SHIFT    6
 
-extern int utf8encode(char *str, unsigned int val);
+/*
+ * NOTE: the original function name was utf8encode(). It was renamed because
+ * in JSON, BOTH encoding and decoding should convert \uxxxx to unicode. All
+ * sources we have seen call this encoding but, to be less confusing in a JSON
+ * library, we call it utf8_to_unicode().
+ */
+extern int utf8_to_unicode(char *str, unsigned int val);
 
 /*
  * The above function and macros are based on code from

diff --git a/man/man1/jstrencode.1 b/man/man1/jstrencode.1
@@ -9,7 +9,7 @@
 .\" "Share and Enjoy!"
 .\"     --  Sirius Cybernetics Corporation Complaints Division, JSON spec department. :-)
 .\"
-.TH jstrdecode 1 "09 November 2024" "jstrdecode" "jparse tools"
+.TH jstrdecode 1 "14 November 2024" "jstrdecode" "jparse tools"
 .SH NAME
 .B jstrdecode
 \- JSON decode command line strings
@@ -121,7 +121,20 @@ command line error
 internal error
 .SH BUGS
 .PP
-A known problem, at least with some terminal applications, is that one has to hit ctrl\-d (or whatever one has
+A number of issues are known when it comes to encoding, amongst them:
+.PP
+It does not convert code points to unicode symbols.
+.PP
+It duplicates
+.BR \e
+when it should be singular, according to JavaScript encoding of JSON with
+.BR \e .
+.PP
+It does not handle invalid 
+.BR \e\-
+escaped characters in the same way as JavaScript does, and in very wrong ways.
+.PP
+Otherwise, a known problem when reading from stdin (in the case that the input does not come from a pipe), at least with some terminal applications, is that one has to hit ctrl\-d (or whatever one has
 .B EOF
 configured as) twice in order for it to properly send
 .B EOF

diff --git a/version.h b/version.h
@@ -30,7 +30,7 @@
  *
  * NOTE: this should match the latest Release string in CHANGES.md
  */
-#define JPARSE_REPO_VERSION "2.0.7 2024-11-13"		/* format: major.minor YYYY-MM-DD */
+#define JPARSE_REPO_VERSION "2.0.8 2024-11-14"		/* format: major.minor YYYY-MM-DD */
 
 /*
  * official jparse version
@@ -40,7 +40,7 @@
 /*
  * official JSON parser version
  */
-#define JPARSE_LIBRARY_VERSION "2.0.3 2024-11-13"	/* library version format: major.minor YYYY-MM-DD */
+#define JPARSE_LIBRARY_VERSION "2.0.4 2024-11-14"	/* library version format: major.minor YYYY-MM-DD */
 
 
 #endif /* INCLUDE_JPARSE_VERSION_H */