-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.js
82 lines (80 loc) · 3.07 KB
/
utf8.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* This file is part of the pastechan project.
* https://github.com/mvasilkov/pastechan
* License: MIT */
'use strict'
// The following is based on Emscripten's UTF-8 functions.
// Returns the number of bytes the given JavaScript string takes if encoded as a UTF8 byte array.
exports.lengthBytesUTF8 = function lengthBytesUTF8(a) {
let len = 0
for (let n = 0; n < a.length; ++n) {
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit,
// not a Unicode code point of the character! So decode UTF16->UTF32->UTF8.
// See http://unicode.org/faq/utf_bom.html#utf16-3
let u = a.charCodeAt(n) // possibly a lead surrogate
if (u >= 0xD800 && u <= 0xDFFF) u = 0x10000 + ((u & 0x3FF) << 10) | (a.charCodeAt(++n) & 0x3FF)
if (u <= 0x7F) {
++len
}
else if (u <= 0x7FF) {
len += 2
}
else if (u <= 0xFFFF) {
len += 3
}
else if (u <= 0x1FFFFF) {
len += 4
}
else if (u <= 0x3FFFFFF) {
len += 5
}
else {
len += 6
}
}
return len
}
exports.stringToUTF8Array = function stringToUTF8Array(a, outU8Array) {
let p = 0
for (let n = 0; n < a.length; ++n) {
// Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit,
// not a Unicode code point of the character! So decode UTF16->UTF32->UTF8.
// See http://unicode.org/faq/utf_bom.html#utf16-3
// For UTF8 byte structure, see http://en.wikipedia.org/wiki/UTF-8#Description and
// https://www.ietf.org/rfc/rfc2279.txt and https://tools.ietf.org/html/rfc3629
let u = a.charCodeAt(n) // possibly a lead surrogate
if (u >= 0xD800 && u <= 0xDFFF) u = 0x10000 + ((u & 0x3FF) << 10) | (a.charCodeAt(++n) & 0x3FF)
if (u <= 0x7F) {
outU8Array[p++] = u
}
else if (u <= 0x7FF) {
outU8Array[p++] = 0xC0 | (u >> 6)
outU8Array[p++] = 0x80 | (u & 63)
}
else if (u <= 0xFFFF) {
outU8Array[p++] = 0xE0 | (u >> 12)
outU8Array[p++] = 0x80 | ((u >> 6) & 63)
outU8Array[p++] = 0x80 | (u & 63)
}
else if (u <= 0x1FFFFF) {
outU8Array[p++] = 0xF0 | (u >> 18)
outU8Array[p++] = 0x80 | ((u >> 12) & 63)
outU8Array[p++] = 0x80 | ((u >> 6) & 63)
outU8Array[p++] = 0x80 | (u & 63)
}
else if (u <= 0x3FFFFFF) {
outU8Array[p++] = 0xF8 | (u >> 24)
outU8Array[p++] = 0x80 | ((u >> 18) & 63)
outU8Array[p++] = 0x80 | ((u >> 12) & 63)
outU8Array[p++] = 0x80 | ((u >> 6) & 63)
outU8Array[p++] = 0x80 | (u & 63)
}
else {
outU8Array[p++] = 0xFC | (u >> 30)
outU8Array[p++] = 0x80 | ((u >> 24) & 63)
outU8Array[p++] = 0x80 | ((u >> 18) & 63)
outU8Array[p++] = 0x80 | ((u >> 12) & 63)
outU8Array[p++] = 0x80 | ((u >> 6) & 63)
outU8Array[p++] = 0x80 | (u & 63)
}
}
}