diff --git a/src/Makefile b/src/Makefile
index 71ca028cde..231d7781f5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -476,7 +476,7 @@ LJVM_O= lj_vm.o
 LJVM_BOUT= $(LJVM_S)
 LJVM_MODE= elfasm
 
-LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_table.o \
+LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_utf8.o lib_table.o \
 	 lib_io.o lib_os.o lib_package.o lib_debug.o lib_jit.o lib_ffi.o
 LJLIB_C= $(LJLIB_O:.o=.c)
 
diff --git a/src/Makefile.dep b/src/Makefile.dep
index 2b1cb5ef29..10bfc65a32 100644
--- a/src/Makefile.dep
+++ b/src/Makefile.dep
@@ -41,6 +41,9 @@ lib_string.o: lib_string.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
 lib_table.o: lib_table.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
  lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h \
  lj_tab.h lj_ff.h lj_ffdef.h lj_lib.h lj_libdef.h
+lib_utf8.o: lib_utf8.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
+ lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h \
+ lj_lib.h lj_libdef.h
 lj_alloc.o: lj_alloc.c lj_def.h lua.h luaconf.h lj_arch.h lj_alloc.h
 lj_api.o: lj_api.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
  lj_err.h lj_errmsg.h lj_debug.h lj_str.h lj_tab.h lj_func.h lj_udata.h \
diff --git a/src/lib_init.c b/src/lib_init.c
index 2ed370e967..ab5eb3d7a2 100644
--- a/src/lib_init.c
+++ b/src/lib_init.c
@@ -22,6 +22,7 @@ static const luaL_Reg lj_lib_load[] = {
   { LUA_IOLIBNAME, luaopen_io },
   { LUA_OSLIBNAME, luaopen_os },
   { LUA_STRLIBNAME, luaopen_string },
+  { LUA_UTF8LIBNAME, luaopen_utf8 },
   { LUA_MATHLIBNAME, luaopen_math },
   { LUA_DBLIBNAME, luaopen_debug },
   { LUA_BITLIBNAME, luaopen_bit },
diff --git a/src/lib_utf8.c b/src/lib_utf8.c
new file mode 100644
index 0000000000..8fdee9420b
--- /dev/null
+++ b/src/lib_utf8.c
@@ -0,0 +1,277 @@
+/*
+** UTF-8 library.
+** Copyright (C) 2018.
+**
+** Major portions taken verbatim or adapted from the Lua interpreter.
+** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
+*/
+
+#define lib_utf8_c
+#define LUA_LIB
+
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+
+#include "lj_obj.h"
+#include "lj_err.h"
+#include "lj_buf.h"
+#include "lj_lib.h"
+
+
+/* ------------------------------------------------------------------------ */
+
+#define LJLIB_MODULE_utf8
+
+#define MAXUNICODE 0x10FFFF
+#define iscont(p) ((*(p) & 0xC0) == 0x80)
+
+/*
+** Decode one UTF-8 sequence starting at 'o'. On success the code point is
+** stored in '*val' (when 'val' is not NULL) and a pointer to the byte after
+** the sequence is returned. Returns NULL if the byte sequence is invalid.
+*/
+static const char *utf8_decode (const char *o, int *val) {
+  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
+  const unsigned char *s = (const unsigned char *)o;
+  unsigned int c = s[0];
+  unsigned int res = 0; /* final result */
+  if (c < 0x80) /* ascii? */
+    res = c;
+  else {
+    int count = 0; /* to count number of continuation bytes */
+    while (c & 0x40) { /* still have continuation bytes? */
+      int cc;
+      if (count == 3) /* a 4th continuation byte can never be valid */
+        return NULL; /* (also keeps the shift below within range) */
+      cc = s[++count]; /* read next byte */
+      if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
+        return NULL; /* invalid byte sequence */
+      res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
+      c <<= 1; /* to test next bit */
+    }
+    res |= ((c & 0x7F) << (count * 5)); /* add first byte */
+    if (res > MAXUNICODE || res <= limits[count])
+      return NULL; /* invalid byte sequence */
+    s += count; /* skip continuation bytes read */
+  }
+  if (val) *val = res;
+  return (const char *)s + 1; /* +1 to include first byte */
+}
+
+
+/*
+** char(...) -> string built from the UTF-8 encoding of each integer
+** argument. Arguments outside [0, MAXUNICODE] raise an argument error.
+*/
+LJLIB_CF(utf8_char)
+{
+  int i, nargs = (int)(L->top - L->base);
+  SBuf *sb = lj_buf_tmp_(L);
+  GCstr *str;
+  for (i = 1; i <= nargs; i++) {
+    int32_t k = lj_lib_checkint(L, i);
+    if (k < 0 || k > MAXUNICODE)
+      lj_err_arg(L, i, LJ_ERR_BADVAL);
+    lj_buf_pututf8(sb, (uint32_t)k);
+  }
+  str = lj_buf_str(L, sb);
+  if (nargs == 0)
+    L->top++; /* no argument slot to reuse: claim one for the result */
+  setstrV(L, L->top-1, str);
+  lj_gc_check(L);
+  return 1;
+}
+
+
+/*
+** utf8len(s [, i [, j]]) --> number of characters that start in the
+** range [i,j], or nil + current position if 's' is not well formed in
+** that interval
+*/
+LJLIB_CF(utf8_len)
+{
+  GCstr *str = lj_lib_checkstr(L, 1);
+  int32_t len = (int32_t)str->len;
+  int32_t posi = lj_lib_optint(L, 2, 1);
+  int32_t posj = lj_lib_optint(L, 3, -1);
+  const char *s = strdata(str);
+  const char *p, *stop;
+  int n = 0;
+
+  if (posj < 0) posj += len+1;
+  if (posi < 0) posi += len+1;
+
+  luaL_argcheck(L, 1 <= posi && posi <= len+1, 2,
+                "initial position out of string");
+  luaL_argcheck(L, posj <= len, 3,
+                "final position out of string");
+  if (posj < 0) posj = 0; /* still negative: empty range, stay in bounds */
+
+  p = s + posi-1;
+  stop = s + posj;
+  while (p < stop) {
+    const char *nextp = utf8_decode(p, NULL);
+    if (nextp == NULL) { /* conversion error? */
+      lua_pushnil(L); /* return nil ... */
+      lua_pushinteger(L, p - s + 1); /* ... and current position */
+      return 2;
+    }
+    p = nextp;
+    n++;
+  }
+  lua_pushinteger(L, n);
+  return 1;
+}
+
+
+
+/*
+** Iteration step for utf8.codes: given the string and the previous byte
+** position, returns the next position and its code point, or nothing at
+** the end of the string. Raises an error on an invalid sequence.
+*/
+static int iter_aux (lua_State *L) {
+  size_t len;
+  const char *s = luaL_checklstring(L, 1, &len);
+  lua_Integer n = lua_tointeger(L, 2) - 1;
+  if (n < 0) /* first iteration? */
+    n = 0; /* start from here */
+  else if (n < (lua_Integer)len) {
+    n++; /* skip current byte */
+    while (iscont(s + n)) n++; /* and its continuations */
+  }
+  if (n >= (lua_Integer)len)
+    return 0; /* no more codepoints */
+  else {
+    int code;
+    const char *next = utf8_decode(s + n, &code);
+    if (next == NULL || iscont(next))
+      return luaL_error(L, "invalid UTF-8 code");
+    lua_pushinteger(L, n + 1);
+    lua_pushinteger(L, code);
+    return 2;
+  }
+}
+
+
+/*
+** codes(s) -> iterator function, 's' and 0, for use in a generic 'for'
+** that yields the byte position and code point of each character.
+*/
+LJLIB_CF(utf8_codes)
+{
+  luaL_checkstring(L, 1);
+  lua_pushcfunction(L, iter_aux);
+  lua_pushvalue(L, 1);
+  lua_pushinteger(L, 0);
+  return 3;
+}
+
+
+
+/*
+** codepoint(s, [i, [j]]) -> returns codepoints for all characters
+** that start in the range [i,j]
+*/
+LJLIB_CF(utf8_codepoint)
+{
+  GCstr *str = lj_lib_checkstr(L, 1);
+  int32_t len = (int32_t)str->len;
+  int32_t posi = lj_lib_optint(L, 2, 1);
+  int32_t posj = lj_lib_optint(L, 3, posi);
+  const char *s = strdata(str);
+  const char *se;
+  int n = 0;
+
+  if (posj < 0) posj += len+1;
+  if (posi < 0) posi += len+1;
+
+  if (posi > posj) return 0; /* empty interval; return no values */
+
+  luaL_argcheck(L, 1 <= posi && posi <= len, 2,
+                "initial position out of string");
+  luaL_argcheck(L, posj <= len, 3,
+                "final position out of string");
+
+  luaL_checkstack(L, posj - posi + 1, "string slice too long");
+  se = s + posj;
+
+  for (s += posi - 1; s < se;) {
+    int code;
+    s = utf8_decode(s, &code);
+    if (s == NULL)
+      return luaL_error(L, "invalid UTF-8 code");
+    lua_pushinteger(L, code);
+    n++;
+  }
+  return n;
+}
+
+
+/*
+** offset(s, n, [i]) -> index where n-th character counting from
+** position 'i' starts; 0 means character at 'i'.
+*/
+LJLIB_CF(utf8_offset)
+{
+  GCstr *str = lj_lib_checkstr(L, 1);
+  int32_t len = (int32_t)str->len;
+  int32_t n = lj_lib_checkint(L, 2);
+  /* 32-bit like 'len': a narrower type would truncate large positions */
+  int32_t posi = lj_lib_optint(L, 3, (n >= 0) ? 1 : len + 1);
+  const char *s = strdata(str);
+
+  if (posi < 0) posi += len+1; /* negative 'i' counts from the end */
+  luaL_argcheck(L, 1 <= posi && posi <= len+1, 3,
+                "position out of range");
+  posi--; /* switch to a 0-based byte index */
+
+  if (n == 0) {
+    /* find beginning of current byte sequence */
+    while (posi > 0 && iscont(s + posi)) posi--;
+  }
+  else {
+    if (iscont(s + posi))
+      return luaL_error(L, "initial position is a continuation byte");
+    if (n < 0) {
+      while (n < 0 && posi > 0) { /* move back */
+        do { /* find beginning of previous character */
+          posi--;
+        } while (posi > 0 && iscont(s + posi));
+        n++;
+      }
+    }
+    else {
+      n--; /* do not move for 1st character */
+      while (n > 0 && posi < len) {
+        do { /* find beginning of next character */
+          posi++;
+        } while (iscont(s + posi)); /* (cannot pass final '\0') */
+        n--;
+      }
+    }
+  }
+  if (n == 0) /* did it find given character? */
+    lua_pushinteger(L, posi + 1);
+  else /* no such character */
+    lua_pushnil(L);
+  return 1;
+}
+
+
+/* ------------------------------------------------------------------------ */
+
+#include "lj_libdef.h"
+
+/* pattern to match a single UTF-8 character */
+#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
+
+/* Open the library: register the functions and set 'charpattern'. */
+LUALIB_API int luaopen_utf8(lua_State *L)
+{
+  LJ_LIB_REG(L, LUA_UTF8LIBNAME, utf8);
+  lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
+  lua_setfield(L, -2, "charpattern");
+  return 1;
+}
diff --git a/src/lj_buf.c b/src/lj_buf.c
index 0dfe7f9807..8fe5399aa4 100644
--- a/src/lj_buf.c
+++ b/src/lj_buf.c
@@ -94,6 +94,25 @@ SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s)
   return sb;
 }
 
+/* Append code point x encoded as UTF-8; x is not range-checked here. */
+SBuf * LJ_FASTCALL lj_buf_pututf8(SBuf *sb, uint32_t x)
+{
+  if (x < 0x800) {
+    if (x < 0x80)
+      return lj_buf_putchar(sb, x);
+    lj_buf_putchar(sb, 0xc0 | (x >> 6));
+  } else {
+    if (x >= 0x10000) {
+      lj_buf_putchar(sb, 0xf0 | (x >> 18));
+      lj_buf_putchar(sb, 0x80 | ((x >> 12) & 0x3f));
+    } else {
+      lj_buf_putchar(sb, 0xe0 | (x >> 12));
+    }
+    lj_buf_putchar(sb, 0x80 | ((x >> 6) & 0x3f));
+  }
+  return lj_buf_putchar(sb, 0x80 | (x & 0x3f));
+}
+
 /* -- High-level buffer put operations ------------------------------------ */
 
 SBuf * LJ_FASTCALL lj_buf_putstr_reverse(SBuf *sb, GCstr *s)
diff --git a/src/lj_buf.h b/src/lj_buf.h
index a405169444..02a4a1e6ab 100644
--- a/src/lj_buf.h
+++ b/src/lj_buf.h
@@ -69,6 +69,7 @@ static LJ_AINLINE char *lj_buf_more(SBuf *sb, MSize sz)
 LJ_FUNC SBuf *lj_buf_putmem(SBuf *sb, const void *q, MSize len);
 LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putchar(SBuf *sb, int c);
 LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s);
+LJ_FUNC SBuf * LJ_FASTCALL lj_buf_pututf8(SBuf *sb, uint32_t x);
 
 static LJ_AINLINE char *lj_buf_wmem(char *p, const void *q, MSize len)
 {
diff --git a/src/lj_strfmt.c b/src/lj_strfmt.c
index d7893ce981..edef5f11be 100644
--- a/src/lj_strfmt.c
+++ b/src/lj_strfmt.c
@@ -19,7 +19,7 @@
 
 static const uint8_t strfmt_map[('x'-'A')+1] = {
   STRFMT_A,0,0,0,STRFMT_E,STRFMT_F,STRFMT_G,0,0,0,0,0,0,
-  0,0,0,0,0,0,0,0,0,0,STRFMT_X,0,0,
+  0,0,0,0,0,0,0,STRFMT_UTF,0,0,STRFMT_X,0,0,
   0,0,0,0,0,0,
   STRFMT_A,0,STRFMT_C,STRFMT_D,STRFMT_E,STRFMT_F,STRFMT_G,0,STRFMT_I,0,0,0,0,
   0,STRFMT_O,STRFMT_P,STRFMT_Q,0,STRFMT_S,0,STRFMT_U,0,0,STRFMT_X
@@ -443,6 +443,9 @@ const char *lj_strfmt_pushvf(lua_State *L, const char *fmt, va_list argp)
     case STRFMT_CHAR:
       lj_buf_putb(sb, va_arg(argp, int));
       break;
+    case STRFMT_UTF:
+      lj_buf_pututf8(sb, (uint32_t)va_arg(argp, long int));
+      break;
     case STRFMT_PTR:
       lj_strfmt_putptr(sb, va_arg(argp, void *));
       break;
diff --git a/src/lj_strfmt.h b/src/lj_strfmt.h
index 6e1d9017e6..c25dcbefa2 100644
--- a/src/lj_strfmt.h
+++ b/src/lj_strfmt.h
@@ -21,7 +21,7 @@ typedef struct FormatState {
 /* Format types (max. 16). */
 typedef enum FormatType {
   STRFMT_EOF, STRFMT_ERR, STRFMT_LIT,
-  STRFMT_INT, STRFMT_UINT, STRFMT_NUM, STRFMT_STR, STRFMT_CHAR, STRFMT_PTR
+  STRFMT_INT, STRFMT_UINT, STRFMT_NUM, STRFMT_STR, STRFMT_CHAR, STRFMT_PTR, STRFMT_UTF
 } FormatType;
 
 /* Format subtypes (bits are reused).
*/
diff --git a/src/lualib.h b/src/lualib.h
index bfc130a1a8..b220264129 100644
--- a/src/lualib.h
+++ b/src/lualib.h
@@ -13,6 +13,7 @@
 #define LUA_COLIBNAME "coroutine"
 #define LUA_MATHLIBNAME "math"
 #define LUA_STRLIBNAME "string"
+#define LUA_UTF8LIBNAME "utf8"
 #define LUA_TABLIBNAME "table"
 #define LUA_IOLIBNAME "io"
 #define LUA_OSLIBNAME "os"
@@ -25,6 +26,7 @@
 LUALIB_API int luaopen_base(lua_State *L);
 LUALIB_API int luaopen_math(lua_State *L);
 LUALIB_API int luaopen_string(lua_State *L);
+LUALIB_API int luaopen_utf8(lua_State *L);
 LUALIB_API int luaopen_table(lua_State *L);
 LUALIB_API int luaopen_io(lua_State *L);
 LUALIB_API int luaopen_os(lua_State *L);
diff --git a/tests/LuaJIT-test-cleanup/test/lang/concat.lua b/tests/LuaJIT-test-cleanup/test/lang/concat.lua
index 04d665b2b7..a369f96687 100644
--- a/tests/LuaJIT-test-cleanup/test/lang/concat.lua
+++ b/tests/LuaJIT-test-cleanup/test/lang/concat.lua
@@ -110,3 +110,9 @@ do --- Very long strings
     assert(s:sub(1, 6) == s:sub(-6, -1))
   end
 end
+
+do --- UTF-8 hexcodes in strings
+  local x
+  for i=1,100 do x = '\u{4f6e} - ' .. '\u{d61} - \u{4f1} - \u{a5}' end
+  assert(x == '佮 - ൡ - ӱ - ¥')
+end
diff --git a/tests/LuaJIT-test-cleanup/test/lib/index b/tests/LuaJIT-test-cleanup/test/lib/index
index cc9d7d7301..b95ec89514 100644
--- a/tests/LuaJIT-test-cleanup/test/lib/index
+++ b/tests/LuaJIT-test-cleanup/test/lib/index
@@ -2,7 +2,8 @@ base
 bit.lua +bit
 math
 string
+utf8.lua
 table
 coroutine
 ffi +ffi
-contents.lua
\ No newline at end of file
+contents.lua
diff --git a/tests/LuaJIT-test-cleanup/test/lib/utf8.lua b/tests/LuaJIT-test-cleanup/test/lib/utf8.lua
new file mode 100644
index 0000000000..2aee08af6c
--- /dev/null
+++ b/tests/LuaJIT-test-cleanup/test/lib/utf8.lua
@@ -0,0 +1,200 @@
+
+do --- is there an utf-8 library?
+  assert(utf8)
+end
+
+
+do --- string from integer codepoints
+  local s = utf8.char(20334, 3425, 1265, 165)
+  assert(s == '佮ൡӱ¥', s)
+end
+
+
+do --- charpattern as specified on doc
+  assert(utf8.charpattern == '[\0-\x7F\xC2-\xF4][\x80-\xBF]*')
+end
+
+
+do --- codes iterator
+  local t = {}
+  for p, c in utf8.codes('佮ൡӱ¥') do -- p: byte position, c: code point
+    t[#t+1] = string.format('%d,%d', p, c)
+  end
+  assert(table.concat(t, ' - ')=='1,20334 - 4,3425 - 7,1265 - 9,165')
+end
+
+
+do --- codepoints
+  local s = '佮ൡӱ¥' -- 10 bytes; characters start at bytes 1, 4, 7 and 9
+  local a, b, c, d = utf8.codepoint(s, 1, -1)
+  assert(a == 20334)
+  assert(b == 3425)
+  assert(c == 1265)
+  assert(d == 165)
+
+  assert(not pcall(utf8.codepoint, s, 0))
+
+  local t = {}
+  for i = 0, #s+1 do
+    local ok, x = pcall(utf8.codepoint, s, i)
+    t[#t+1] = ok and string.format('%d:ok:%d', i, x)
+                  or string.format('%d:bad', i)
+  end
+  assert(table.concat(t, '\n') ==
+    ([[0:bad 1:ok:20334 2:bad 3:bad 4:ok:3425 5:bad
+       6:bad 7:ok:1265 8:bad 9:ok:165 10:bad 11:bad]]):gsub('%s+', '\n'))
+
+  local t = {}
+  for i = 0, #s+1 do
+    for j = 0, #s+1 do
+      local ok, a, b, c, d = pcall(utf8.codepoint, s, i, j)
+      t[#t+1] = ok and string.format('%d,%d:ok:%s,%s,%s,%s', i, j, a, b, c, d)
+                    or string.format('%d,%d:bad', i, j)
+    end
+  end
+  assert(table.concat(t, '\n') ==
+    ([[0,0:bad 0,1:bad 0,2:bad 0,3:bad 0,4:bad 0,5:bad 0,6:bad 0,7:bad 0,8:bad 0,9:bad 0,10:bad 0,11:bad
+       1,0:ok:nil,nil,nil,nil 1,1:ok:20334,nil,nil,nil 1,2:ok:20334,nil,nil,nil
+       1,3:ok:20334,nil,nil,nil 1,4:ok:20334,3425,nil,nil 1,5:ok:20334,3425,nil,nil
+       1,6:ok:20334,3425,nil,nil 1,7:ok:20334,3425,1265,nil 1,8:ok:20334,3425,1265,nil
+       1,9:ok:20334,3425,1265,165 1,10:ok:20334,3425,1265,165 1,11:bad
+
+       2,0:ok:nil,nil,nil,nil 2,1:ok:nil,nil,nil,nil 2,2:bad
+       2,3:bad 2,4:bad 2,5:bad 2,6:bad 2,7:bad 2,8:bad 2,9:bad 2,10:bad 2,11:bad
+
+       3,0:ok:nil,nil,nil,nil 3,1:ok:nil,nil,nil,nil 3,2:ok:nil,nil,nil,nil
+       3,3:bad 3,4:bad 3,5:bad 3,6:bad 3,7:bad 3,8:bad 3,9:bad 3,10:bad 3,11:bad
+
+       4,0:ok:nil,nil,nil,nil 4,1:ok:nil,nil,nil,nil 4,2:ok:nil,nil,nil,nil
+       4,3:ok:nil,nil,nil,nil 4,4:ok:3425,nil,nil,nil 4,5:ok:3425,nil,nil,nil
+       4,6:ok:3425,nil,nil,nil 4,7:ok:3425,1265,nil,nil 4,8:ok:3425,1265,nil,nil
+       4,9:ok:3425,1265,165,nil 4,10:ok:3425,1265,165,nil 4,11:bad
+
+       5,0:ok:nil,nil,nil,nil 5,1:ok:nil,nil,nil,nil 5,2:ok:nil,nil,nil,nil
+       5,3:ok:nil,nil,nil,nil 5,4:ok:nil,nil,nil,nil 5,5:bad
+       5,6:bad 5,7:bad 5,8:bad 5,9:bad 5,10:bad 5,11:bad
+
+       6,0:ok:nil,nil,nil,nil 6,1:ok:nil,nil,nil,nil 6,2:ok:nil,nil,nil,nil
+       6,3:ok:nil,nil,nil,nil 6,4:ok:nil,nil,nil,nil 6,5:ok:nil,nil,nil,nil
+       6,6:bad 6,7:bad 6,8:bad 6,9:bad 6,10:bad 6,11:bad
+
+       7,0:ok:nil,nil,nil,nil 7,1:ok:nil,nil,nil,nil 7,2:ok:nil,nil,nil,nil
+       7,3:ok:nil,nil,nil,nil 7,4:ok:nil,nil,nil,nil 7,5:ok:nil,nil,nil,nil
+       7,6:ok:nil,nil,nil,nil 7,7:ok:1265,nil,nil,nil 7,8:ok:1265,nil,nil,nil
+       7,9:ok:1265,165,nil,nil 7,10:ok:1265,165,nil,nil 7,11:bad
+
+       8,0:ok:nil,nil,nil,nil 8,1:ok:nil,nil,nil,nil 8,2:ok:nil,nil,nil,nil
+       8,3:ok:nil,nil,nil,nil 8,4:ok:nil,nil,nil,nil 8,5:ok:nil,nil,nil,nil
+       8,6:ok:nil,nil,nil,nil 8,7:ok:nil,nil,nil,nil 8,8:bad 8,9:bad 8,10:bad 8,11:bad
+
+       9,0:ok:nil,nil,nil,nil 9,1:ok:nil,nil,nil,nil 9,2:ok:nil,nil,nil,nil
+       9,3:ok:nil,nil,nil,nil 9,4:ok:nil,nil,nil,nil 9,5:ok:nil,nil,nil,nil
+       9,6:ok:nil,nil,nil,nil 9,7:ok:nil,nil,nil,nil 9,8:ok:nil,nil,nil,nil
+       9,9:ok:165,nil,nil,nil 9,10:ok:165,nil,nil,nil 9,11:bad
+
+       10,0:ok:nil,nil,nil,nil 10,1:ok:nil,nil,nil,nil 10,2:ok:nil,nil,nil,nil
+       10,3:ok:nil,nil,nil,nil 10,4:ok:nil,nil,nil,nil 10,5:ok:nil,nil,nil,nil
+       10,6:ok:nil,nil,nil,nil 10,7:ok:nil,nil,nil,nil 10,8:ok:nil,nil,nil,nil
+       10,9:ok:nil,nil,nil,nil 10,10:bad 10,11:bad
+
+       11,0:ok:nil,nil,nil,nil 11,1:ok:nil,nil,nil,nil 11,2:ok:nil,nil,nil,nil
+       11,3:ok:nil,nil,nil,nil 11,4:ok:nil,nil,nil,nil 11,5:ok:nil,nil,nil,nil
+       11,6:ok:nil,nil,nil,nil 11,7:ok:nil,nil,nil,nil 11,8:ok:nil,nil,nil,nil
+       11,9:ok:nil,nil,nil,nil 11,10:ok:nil,nil,nil,nil 11,11:bad]]):gsub('%s+', '\n'))
+end
+
+
+do --- len
+  local s = '佮ൡӱ¥'
+  assert(#s == 10)
+  assert(utf8.len(s) == 4)
+  assert(not pcall(utf8.len, s, 0))
+
+  local t = {}
+  for i = 1, #s+1 do
+    local a, b = utf8.len(s, i) -- a: count, or nil with b = failing position
+    t[#t+1] = string.format('%d:%s,%s', i, a, b)
+  end
+  assert(table.concat(t, '\n') ==
+    ([[1:4,nil 2:nil,2 3:nil,3 4:3,nil 5:nil,5 6:nil,6
+       7:2,nil 8:nil,8 9:1,nil 10:nil,10 11:0,nil]]):gsub('%s+', '\n'))
+
+  assert(not pcall(utf8.len, s, #s+2))
+
+  t = {}
+  for i = 1, #s+1 do
+    for j = 1, #s do
+      local a, b = utf8.len(s, i, j)
+      t[#t+1] = string.format('%d,%d:%s,%s', i, j, a, b)
+    end
+  end
+  assert(table.concat(t, '\n') ==
+    ([[1,1:1,nil 1,2:1,nil 1,3:1,nil 1,4:2,nil 1,5:2,nil
+       1,6:2,nil 1,7:3,nil 1,8:3,nil 1,9:4,nil 1,10:4,nil
+       2,1:0,nil 2,2:nil,2 2,3:nil,2 2,4:nil,2 2,5:nil,2
+       2,6:nil,2 2,7:nil,2 2,8:nil,2 2,9:nil,2 2,10:nil,2
+       3,1:0,nil 3,2:0,nil 3,3:nil,3 3,4:nil,3 3,5:nil,3
+       3,6:nil,3 3,7:nil,3 3,8:nil,3 3,9:nil,3 3,10:nil,3
+       4,1:0,nil 4,2:0,nil 4,3:0,nil 4,4:1,nil 4,5:1,nil
+       4,6:1,nil 4,7:2,nil 4,8:2,nil 4,9:3,nil 4,10:3,nil
+       5,1:0,nil 5,2:0,nil 5,3:0,nil 5,4:0,nil 5,5:nil,5
+       5,6:nil,5 5,7:nil,5 5,8:nil,5 5,9:nil,5 5,10:nil,5
+       6,1:0,nil 6,2:0,nil 6,3:0,nil 6,4:0,nil 6,5:0,nil
+       6,6:nil,6 6,7:nil,6 6,8:nil,6 6,9:nil,6 6,10:nil,6
+       7,1:0,nil 7,2:0,nil 7,3:0,nil 7,4:0,nil 7,5:0,nil
+       7,6:0,nil 7,7:1,nil 7,8:1,nil 7,9:2,nil 7,10:2,nil
+       8,1:0,nil 8,2:0,nil 8,3:0,nil 8,4:0,nil 8,5:0,nil
+       8,6:0,nil 8,7:0,nil 8,8:nil,8 8,9:nil,8 8,10:nil,8
+       9,1:0,nil 9,2:0,nil 9,3:0,nil 9,4:0,nil 9,5:0,nil
+       9,6:0,nil 9,7:0,nil 9,8:0,nil 9,9:1,nil 9,10:1,nil
+       10,1:0,nil 10,2:0,nil 10,3:0,nil 10,4:0,nil 10,5:0,nil
+       10,6:0,nil 10,7:0,nil 10,8:0,nil 10,9:0,nil 10,10:nil,10
+       11,1:0,nil 11,2:0,nil 11,3:0,nil 11,4:0,nil 11,5:0,nil
+       11,6:0,nil 11,7:0,nil 11,8:0,nil 11,9:0,nil 11,10:0,nil]]):gsub('%s+', '\n'))
+end
+
+
+do --- offset in bytes of character
+  local s = '佮ൡӱ¥'
+  assert(not pcall(utf8.offset, s)) -- 'n' is a required argument
+
+  local t = {}
+  for n = -utf8.len(s)-1, utf8.len(s)+2 do
+    t[#t+1] = tostring(utf8.offset(s, n))
+  end
+  assert(table.concat(t, ';')=='nil;1;4;7;9;1;1;4;7;9;11;nil')
+
+  assert(utf8.offset(s, -5) == nil)
+  assert(utf8.offset(s, 1) == 1)
+  assert(utf8.offset(s, -4) == 1)
+  assert(utf8.offset(s, 2) == 4)
+  assert(utf8.offset(s, -3) == 4)
+  assert(utf8.offset(s, 3) == 7)
+  assert(utf8.offset(s, -2) == 7)
+  assert(utf8.offset(s, 4) == 9)
+  assert(utf8.offset(s, -1) == 9)
+  assert(utf8.offset(s, 5) == 11)
+  assert(utf8.offset(s, 6) == nil)
+end
+
+
+
+do --- check lua API lua_pushfstring() handling %U
+  local ffi = require 'ffi'
+  ffi.cdef [[
+    typedef struct lua_State lua_State;
+
+    lua_State *luaL_newstate(void);
+    const char *lua_pushfstring (lua_State *L, const char *fmt, ...);
+    void lua_close (lua_State *L);
+  ]]
+
+  local L = ffi.C.luaL_newstate()
+  assert (L ~= nil)
+  local sp = ffi.C.lua_pushfstring(L, "num: %f, char: %c, UTF: %U",
+      19.5, ffi.cast('int',67), ffi.cast('long int', 20334))
+  local s = ffi.string(sp) -- copy before lua_close() releases the state
+  ffi.C.lua_close(L)
+  assert (s == "num: 19.5, char: C, UTF: 佮")
+end
+