From 3e653e3e45be930924cd4167788b1f65b414a2ac Mon Sep 17 00:00:00 2001 From: Oliver Crow Date: Wed, 31 Aug 2016 16:17:25 -0700 Subject: [PATCH] Added example UTF-8 conversion functions Added push_utf8_string() and get_utf8_string() functions that implement the suggested encoding/decoding of surrogate pairs when passing UTF8 strings in to or out of duktape. --- HowtoNonBmpCharacters.md | 91 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/HowtoNonBmpCharacters.md b/HowtoNonBmpCharacters.md index 971a332..5f73809 100644 --- a/HowtoNonBmpCharacters.md +++ b/HowtoNonBmpCharacters.md @@ -82,6 +82,97 @@ pairs as appropriate. It's probably best to write helpers to: - Read a string the value stack, converting surrogate pairs (CESU-8) into UTF-8. +For example: +```c +// Get a string from the duk stack, converting data from CESU-8 encoding to UTF-8. +// Returns null if value at index is not a string, or there's insufficient memory +// Caller is responsible for freeing returned buffer (if not null). +// +char *get_utf8_string(duk_context *ctx, duk_idx_t index) { + duk_size_t len; + const duk_uint8_t *in = (duk_uint8_t *)duk_get_lstring(ctx, index, &len); + if (in == NULL) return NULL; + + duk_uint8_t *buf = malloc(len); + if (buf == NULL) return NULL; + + duk_uint8_t *out = buf; + while (*in) { + // next six bytes represent a codepoint encoded as UTF-16 surrogate pair + if (in[0] == 0xED + && (in[1] & 0xF0) == 0xA0 + && (in[2] & 0xC0) == 0x80 + && (in[3] == 0xED) + && (in[4] & 0xF0) == 0xB0 + && (in[5] & 0xC0) == 0x80) + { + // push coding parts of 6 bytes of UTF-16 surrogate pair into a 4 byte UTF-8 codepoint + // adding 1 to in[1] adds 0x10000 to code-point that was subtracted for UTF-16 encoding + out[0] = 0xF0 | ((in[1]+1) & 0x1C) >> 2; + out[1] = 0x80 | ((in[1]+1) & 0x03) << 4 | (in[2] & 0x3C) >> 2; + out[2] = 0x80 | (in[2] & 0x03) << 4 | (in[4] & 0x0F); + out[3] = in[5]; + in += 6; out += 4; + } else { + // copy anything else as is + *out++ = *in++; + } + } + *out = '\0'; + return (char *)buf; +} + +// Push a UTF-8 string to the duk stack, first converting it to CESU-8 encoding +// Returns a pointer to the interned string +const char *push_utf8_string(duk_context *ctx, const char *str) { + const duk_uint8_t *in = (const duk_uint8_t *)str; + int supp_count = 0, out_size = 0; + + // scan input string, look for 4 byte UTF-8 chars, calculate required buffer size + if (in != NULL) { + while (*in) { + if ((in[0] & 0xF8) == 0xF0 && (in[1] & 0xC0) == 0x80 + && (in[2] & 0xC0) == 0x80 && (in[3] & 0xC0) == 0x80) + { + supp_count++; + in += 4; out_size += 6; + } else { + in += 1; out_size += 1; + } + } + } + // found no 4 byte characters to convert, so just push the string and return + if (supp_count == 0) { + return duk_push_string(ctx, str); + } + + // convert some UTF-8 characters to CESU-8 + duk_uint8_t *buf = malloc(out_size + 1); + duk_uint8_t *out = buf; + in = (const duk_uint8_t *)str; + while (*in) { + if ((in[0] & 0xF8) == 0xF0 && (in[1] & 0xC0) == 0x80 + && (in[2] & 0xC0) == 0x80 && (in[3] & 0xC0) == 0x80) + { + out[0] = 0xED; + out[1] = 0xA0 | (((in[0] & 0x07) << 2 | (in[1] & 0x30) >> 4) - 1); + out[2] = 0x80 | (in[1] & 0x0F) << 2 | (in[2] & 0x30) >> 4; + out[3] = 0xED; + out[4] = 0xB0 | (in[2] & 0x0F); + out[5] = in[3]; + in += 4; out += 6; + } else { + // copy anything else as is + *out++ = *in++; + } + } + *out = '\0'; + const char *result = duk_push_string(ctx, (char *)buf); + free(buf); + return result; +} +``` + ## Using Duktape UTF-8 This approach is convenient for C code, because strings can be expressed