-
Notifications
You must be signed in to change notification settings - Fork 0
/
lunicode.lua
147 lines (122 loc) · 3.72 KB
/
lunicode.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/lua
--[[ lunicode.lua
A pure-Lua unicode encoding/decoding library.
Module exports two public functions:
* decode(s) takes a UTF-8 encoded string and returns an array of
code points
* encode(a) takes an array of code points and returns a UTF-8
encoded string
--]]
-- Self-test switch. While TEST is nil, loading this file just returns the
-- module table. When TEST is a table, the harness at the bottom of the file
-- runs the checks whose keys are set ('decode', 'encode') and the module
-- table is not returned. Swap in the commented-out definition to enable it.
local TEST = nil
--~ local TEST = { ['decode'] = true,
--~ ['encode'] = true, }
local bit32 = require 'bit32'
-- Table that collects the module's public functions.
local lib = {}

-- Masks selecting the top 1, 2, 3, 4 and 5 bits of a byte.
local MASK_HIGH = { 0x80, 0xC0, 0xE0, 0xF0, 0xF8 }
-- Masks selecting the bottom 1 through 6 bits of a byte.
local MASK_LOW = { 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F }
-- Marker bits carried by the lead byte of a 2-, 3- and 4-byte sequence.
local HEAD = { 0xC0, 0xE0, 0xF0 }
-- Exclusive upper bounds of the code points that fit in 2, 3 and 4 bytes.
local UPPER_LIMITS = { 0x800, 0x10000, 0x110000 }

--[[ HEADS describes every kind of multibyte lead byte:
       [mask] = { marker, payload, length }
     mask:    bitmask applied to a byte to isolate its marker bits
     marker:  value the masked byte must equal to be this kind of lead byte
     payload: bitmask extracting the code point bits stored in the lead byte
     length:  total number of bytes in the encoded code point
--]]
local HEADS = {
  [0xE0] = { 0xC0, 0x1F, 2 },
  [0xF0] = { 0xE0, 0x0F, 3 },
  [0xF8] = { 0xF0, 0x07, 4 },
}

-- Marker bits of a continuation byte (binary 10xxxxxx).
local HEAD_CONT = 0x80
-- Code point produced in place of a malformed sequence (U+FFFD).
local REPLACEMENT = 0xFFFD
-- Decode the single code point that starts at byte n of s.
--   s:    UTF-8 encoded string
--   n:    1-based index of the first byte of the code point
--   maxn: index of the last byte of s that may be read
-- Returns three values:
--   on success:            code point, number of bytes consumed, nil
--   on a bad continuation: REPLACEMENT, number of bytes consumed, nil
--   on failure:            nil, nil, error message
-- A failure is reported when the sequence would run past maxn or when the
-- byte at n is not a valid lead byte (e.g. a stray continuation byte).
-- NOTE: overlong forms, surrogates and values above 0x10FFFF are not
-- rejected here; nothing in this function checks the decoded value.
local function munch_at_n(s, n, maxn)
  local c = string.byte(s, n)
  -- High bit clear: plain single-byte (ASCII) code point.
  if bit32.band(c, MASK_HIGH[1]) == 0 then
    return c, 1, nil
  end
  -- The three lead-byte patterns are mutually exclusive, so the
  -- unspecified iteration order of pairs() does not matter.
  for m, a in pairs(HEADS) do
    if bit32.band(c, m) == a[1] then
      local nextra = a[3] - 1
      if n + nextra > maxn then
        return nil, nil, 'string ends mid codepoint'
      end
      local val = bit32.band(c, a[2])
      for i = n + 1, n + nextra do
        c = string.byte(s, i)
        if bit32.band(MASK_HIGH[2], c) ~= HEAD_CONT then
          -- Malformed sequence: consume only the bytes before the
          -- offending one so that decoding resumes AT that byte. It may
          -- itself start a valid code point and must not be swallowed.
          return REPLACEMENT, i - n, nil
        end
        -- Each continuation byte contributes its low six bits.
        val = bit32.lshift(val, 6) + bit32.band(c, MASK_LOW[6])
      end
      return val, nextra + 1, nil
    end
  end
  return nil, nil, 'invalid unicode string'
end
-- Decode a UTF-8 encoded string into an array of code points.
--   s: UTF-8 encoded string
-- Returns two values:
--   an array (1-based table) of the code points decoded so far
--   nil on success, or an error message if decoding stopped early
-- On error the array still holds every code point decoded before the
-- failure.
function lib.decode(s)
  -- All working state is local so that nothing leaks into the global
  -- environment and concurrent (e.g. coroutine) callers do not interfere.
  local ptr = 1
  local len = #s
  local points = {}
  while ptr <= len do
    local p, adv, err = munch_at_n(s, ptr, len)
    if not p then
      return points, err
    end
    points[#points + 1] = p
    ptr = ptr + adv
  end
  return points, nil
end
-- Encode an array of code points as a UTF-8 string.
--   a: array (1-based table) of numeric code points
-- Returns the UTF-8 encoded string.
-- Code points that UTF-8 cannot represent (negative, or not below the
-- last entry of UPPER_LIMITS) are encoded as REPLACEMENT rather than
-- being silently dropped or raising inside string.char.
function lib.encode(a)
  local chunks = {}
  for _, c in ipairs(a) do
    if c < 0 or c >= UPPER_LIMITS[#UPPER_LIMITS] then
      c = REPLACEMENT
    end
    if c < 0x80 then
      -- Single-byte (ASCII) code point.
      chunks[#chunks + 1] = string.char(c)
    else
      -- Entry n of UPPER_LIMITS bounds the code points that need one
      -- lead byte followed by n continuation bytes.
      for n, lim in ipairs(UPPER_LIMITS) do
        if c < lim then
          -- Peel off six bits at a time, lowest first, filling the
          -- continuation bytes from last to first.
          local tail = {}
          for i = n, 1, -1 do
            tail[i] = string.char(
              bit32.bor(HEAD_CONT, bit32.band(MASK_LOW[6], c)))
            c = bit32.rshift(c, 6)
          end
          -- The remaining high bits go into the lead byte.
          local head = bit32.bor(HEAD[n], bit32.band(MASK_LOW[6 - n], c))
          chunks[#chunks + 1] = string.char(head) .. table.concat(tail)
          break
        end
      end
    end
  end
  return table.concat(chunks)
end
-- When TEST is set, run the interactive self-test instead of returning
-- the module; otherwise return the module table to the caller of require.
if TEST then
  -- Default code points used by the encode test when decode is not run.
  local parr = { 65, 66, 67, 68, 69, 32, 97, 98, 99, 100 }
  if TEST.decode then
    print('feed me a unicode string to test decoding')
    -- read() returns nil at end of input; decode an empty string instead
    -- of failing on the length operator.
    local s = io.stdin:read('*line') or ''
    -- lib.decode always returns a table, so the error has to be detected
    -- through its second return value.
    local points, err = lib.decode(s)
    if err then
      print('ERROR: ' .. err)
    end
    -- On error this prints the code points decoded before the failure.
    print(table.concat(points, ' '))
    parr = points
  end
  if TEST.encode then
    local s = lib.encode(parr)
    print(s)
  end
else
  return lib
end