-
Notifications
You must be signed in to change notification settings - Fork 30k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
build: encode non-ASCII Latin1 characters as one byte in JS2C #51605
Conversation
Review requested:
|
7d3db35
to
48a3de6
Compare
cc @nodejs/undici Maybe a separate PR could be done in undici to replace |
If we remove those comments it'll make debugging fetch 10x harder. Seems like a bug where esbuild isn't dropping those comments, where it does drop others. |
If you replace the GetOctalCode with the following code... See codestatic constexpr std::array<char, 4*256> octal_table =
[]() constexpr {
std::array<char, 4*256> result{};
for(uint32_t ch = 0; ch < 256; ch++) {
if (ch >= ' ' && ch <= '~' && ch != '\\' && ch != '"' && ch != '?') {
result[4*ch] = ch;
} else {
result[4*ch] = '\\';
result[4*ch+1] = '0' + ((ch >> 6) & 7);
result[4*ch+2] = '0' + ((ch >> 3) & 7);
result[4*ch+3] = '0' + (ch & 7);
}
}
return result;
}();
static constexpr std::array<std::string_view, 256> octal_table_string =
[]() constexpr {
std::array<std::string_view, 256> result{};
for(uint32_t ch = 0; ch < 256; ch++) {
size_t len = octal_table[4*ch] == '\\' ? 4 : 1;
result[ch] = std::string_view(&octal_table[4*ch], len);
}
return result;
}();
constexpr std::string_view GetOctalCode(uint8_t index) {
return octal_table_string[index];
} It might compile down to (GCC13, x64) a handful of instructions and be constexpr: See codeGetOctalCode(unsigned char):
movzx edi, dil
sal rdi, 4
mov rax, QWORD PTR octal_table_string[rdi]
mov rdx, QWORD PTR octal_table_string[rdi+8] Under GCC13, with x64, the current GetOctalCode function might compile to something much heavier: See codeGetOctalCode[abi:cxx11](unsigned char):
push r15
push r14
push r13
push r12
push rbp
push rbx
sub rsp, 152
mov DWORD PTR [rsp+12], edi
movzx eax, BYTE PTR guard variable for GetOctalCode[abi:cxx11](unsigned char)::table[rip]
test al, al
je .L122
.L12:
movzx eax, BYTE PTR [rsp+12]
sal rax, 5
add rax, QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip]
add rsp, 152
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L122:
mov edi, OFFSET FLAT:guard variable for GetOctalCode[abi:cxx11](unsigned char)::table
call __cxa_guard_acquire
test eax, eax
je .L12
pxor xmm0, xmm0
mov edi, 8192
mov QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip+16], 0
movaps XMMWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip], xmm0
call operator new(unsigned long)
lea rcx, [rax+8192]
mov QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip], rax
mov QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip+16], rcx
.L14:
lea rdx, [rax+16]
mov QWORD PTR [rax+8], 0
add rax, 32
mov QWORD PTR [rax-32], rdx
mov BYTE PTR [rax-16], 0
cmp rax, rcx
jne .L14
mov QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip+8], rax
mov r12d, -34
xor ebx, ebx
lea r15, [rsp+64]
jmp .L61
.L16:
lea rbp, [rsp+32]
mov edx, ebx
mov esi, 1
lea rdi, [rsp+16]
mov QWORD PTR [rsp+16], rbp
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct(unsigned long, char)
.L17:
mov r13, rbx
mov rdx, QWORD PTR [rsp+24]
mov rax, QWORD PTR [rsp+16]
sal r13, 5
add r13, QWORD PTR GetOctalCode[abi:cxx11](unsigned char)::table[rip]
mov rdi, QWORD PTR [r13+0]
lea rcx, [r13+16]
cmp rdi, rcx
je .L123
cmp rax, rbp
je .L51
mov QWORD PTR [r13+0], rax
mov rcx, QWORD PTR [r13+16]
mov QWORD PTR [r13+8], rdx
mov rax, QWORD PTR [rsp+32]
mov QWORD PTR [r13+16], rax
test rdi, rdi
je .L57
mov QWORD PTR [rsp+16], rdi
mov QWORD PTR [rsp+32], rcx
.L54:
mov QWORD PTR [rsp+24], 0
mov BYTE PTR [rdi], 0
mov rdi, QWORD PTR [rsp+16]
cmp rdi, rbp
je .L58
mov rax, QWORD PTR [rsp+32]
add rbx, 1
add r12d, 1
lea rsi, [rax+1]
call operator delete(void*, unsigned long)
cmp rbx, 256
je .L60
.L61:
lea eax, [rbx-32]
mov r8d, ebx
cmp al, 94
ja .L15
cmp r12b, 58
ja .L16
movabs rax, -288230376688582658
bt rax, r12
jc .L16
.L15:
mov r14d, r8d
mov ebp, r8d
shr r8b, 6
mov ecx, 1
movzx eax, WORD PTR .LC0[rip]
shr r14b, 3
and ebp, 7
xor edx, edx
add r8d, 48
and r14d, 7
mov esi, 1
add ebp, 48
and r8d, 127
lea rdi, [rsp+48]
mov QWORD PTR [rsp+48], r15
add r14d, 48
mov QWORD PTR [rsp+56], 1
mov WORD PTR [rsp+64], ax
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)
lea r13, [rsp+96]
lea rcx, [rax+16]
mov rsi, QWORD PTR [rax+8]
mov QWORD PTR [rsp+80], r13
mov rdx, QWORD PTR [rax]
cmp rdx, rcx
je .L124
mov QWORD PTR [rsp+80], rdx
mov rdx, QWORD PTR [rax+16]
mov QWORD PTR [rsp+96], rdx
.L25:
mov QWORD PTR [rax], rcx
mov r8d, r14d
mov ecx, 1
xor edx, edx
mov BYTE PTR [rax+16], 0
and r8d, 127
lea rdi, [rsp+80]
mov QWORD PTR [rax+8], 0
mov QWORD PTR [rsp+88], rsi
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)
lea r14, [rsp+128]
lea rcx, [rax+16]
mov rsi, QWORD PTR [rax+8]
mov QWORD PTR [rsp+112], r14
mov rdx, QWORD PTR [rax]
cmp rdx, rcx
je .L125
mov QWORD PTR [rsp+112], rdx
mov rdx, QWORD PTR [rax+16]
mov QWORD PTR [rsp+128], rdx
.L33:
mov QWORD PTR [rax], rcx
mov r8d, ebp
mov ecx, 1
xor edx, edx
mov BYTE PTR [rax+16], 0
and r8d, 127
lea rdi, [rsp+112]
mov QWORD PTR [rax+8], 0
mov QWORD PTR [rsp+120], rsi
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)
lea rbp, [rsp+32]
lea rsi, [rax+16]
mov rdx, QWORD PTR [rax+8]
mov QWORD PTR [rsp+16], rbp
mov rcx, QWORD PTR [rax]
cmp rcx, rsi
je .L126
mov QWORD PTR [rsp+16], rcx
mov rcx, QWORD PTR [rax+16]
mov QWORD PTR [rsp+32], rcx
.L41:
mov QWORD PTR [rax], rsi
mov rdi, QWORD PTR [rsp+112]
mov QWORD PTR [rsp+24], rdx
mov QWORD PTR [rax+8], 0
mov BYTE PTR [rax+16], 0
cmp rdi, r14
je .L42
mov rax, QWORD PTR [rsp+128]
lea rsi, [rax+1]
call operator delete(void*, unsigned long)
.L42:
mov rdi, QWORD PTR [rsp+80]
cmp rdi, r13
je .L43
mov rax, QWORD PTR [rsp+96]
lea rsi, [rax+1]
call operator delete(void*, unsigned long)
.L43:
mov rdi, QWORD PTR [rsp+48]
cmp rdi, r15
je .L17
mov rax, QWORD PTR [rsp+64]
lea rsi, [rax+1]
call operator delete(void*, unsigned long)
jmp .L17
.L123:
cmp rax, rbp
je .L51
mov QWORD PTR [r13+0], rax
mov QWORD PTR [r13+8], rdx
mov rax, QWORD PTR [rsp+32]
mov QWORD PTR [r13+16], rax
.L57:
mov QWORD PTR [rsp+16], rbp
lea rbp, [rsp+32]
mov rdi, rbp
jmp .L54
.L58:
add rbx, 1
add r12d, 1
cmp rbx, 256
jne .L61
.L60:
mov edx, OFFSET FLAT:__dso_handle
mov esi, OFFSET FLAT:GetOctalCode[abi:cxx11](unsigned char)::table
mov edi, OFFSET FLAT:std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::~vector() [complete object destructor]
call __cxa_atexit
mov edi, OFFSET FLAT:guard variable for GetOctalCode[abi:cxx11](unsigned char)::table
call __cxa_guard_release
jmp .L12
.L51:
lea rcx, [rsp+16]
cmp r13, rcx
je .L63
test rdx, rdx
je .L55
cmp rdx, 1
je .L127
mov rsi, rbp
call memcpy
mov rdx, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [r13+0]
.L55:
mov QWORD PTR [r13+8], rdx
mov BYTE PTR [rdi+rdx], 0
mov rdi, QWORD PTR [rsp+16]
jmp .L54
.L126:
add rdx, 1
mov r8, rbp
mov rcx, rsi
cmp edx, 8
jnb .L128
.L35:
xor edi, edi
test dl, 4
je .L38
mov edi, DWORD PTR [rcx]
mov DWORD PTR [r8], edi
mov edi, 4
.L38:
test dl, 2
je .L39
movzx r9d, WORD PTR [rcx+rdi]
mov WORD PTR [r8+rdi], r9w
add rdi, 2
.L39:
and edx, 1
je .L121
movzx edx, BYTE PTR [rcx+rdi]
mov BYTE PTR [r8+rdi], dl
.L121:
mov rdx, QWORD PTR [rax+8]
jmp .L41
.L124:
add rsi, 1
mov r8, r13
mov rdx, rcx
cmp esi, 8
jnb .L129
.L19:
xor edi, edi
test sil, 4
je .L22
mov edi, DWORD PTR [rdx]
mov DWORD PTR [r8], edi
mov edi, 4
.L22:
test sil, 2
je .L23
movzx r9d, WORD PTR [rdx+rdi]
mov WORD PTR [r8+rdi], r9w
add rdi, 2
.L23:
and esi, 1
je .L119
movzx edx, BYTE PTR [rdx+rdi]
mov BYTE PTR [r8+rdi], dl
.L119:
mov rsi, QWORD PTR [rax+8]
jmp .L25
.L125:
add rsi, 1
mov r8, r14
mov rdx, rcx
cmp esi, 8
jnb .L130
.L27:
xor edi, edi
test sil, 4
je .L30
mov edi, DWORD PTR [rdx]
mov DWORD PTR [r8], edi
mov edi, 4
.L30:
test sil, 2
je .L31
movzx r9d, WORD PTR [rdx+rdi]
mov WORD PTR [r8+rdi], r9w
add rdi, 2
.L31:
and esi, 1
je .L120
movzx edx, BYTE PTR [rdx+rdi]
mov BYTE PTR [r8+rdi], dl
.L120:
mov rsi, QWORD PTR [rax+8]
jmp .L33
.L130:
mov r9d, esi
xor edx, edx
and r9d, -8
.L28:
mov edi, edx
add edx, 8
mov r8, QWORD PTR [rcx+rdi]
mov QWORD PTR [r14+rdi], r8
cmp edx, r9d
jb .L28
lea r8, [r14+rdx]
add rdx, rcx
jmp .L27
.L129:
mov r9d, esi
xor edx, edx
and r9d, -8
.L20:
mov edi, edx
add edx, 8
mov r8, QWORD PTR [rcx+rdi]
mov QWORD PTR [r13+0+rdi], r8
cmp edx, r9d
jb .L20
lea r8, [r13+0+rdx]
add rdx, rcx
jmp .L19
.L128:
mov r9d, edx
xor ecx, ecx
and r9d, -8
.L36:
mov edi, ecx
add ecx, 8
mov r8, QWORD PTR [rsi+rdi]
mov QWORD PTR [rbp+0+rdi], r8
cmp ecx, r9d
jb .L36
lea r8, [rbp+0+rcx]
add rcx, rsi
jmp .L35
.L127:
movzx eax, BYTE PTR [rsp+32]
mov BYTE PTR [rdi], al
mov rdx, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [r13+0]
jmp .L55
.L63:
mov rdi, rax
jmp .L54
mov rbx, rax
jmp .L62
mov rbx, rax
jmp .L47
mov rbx, rax
jmp .L49
mov rbx, rax
jmp .L48
mov rbx, rax
jmp .L50
GetOctalCode[abi:cxx11](unsigned char) [clone .cold]:
.L47:
lea rdi, [rsp+112]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L48:
lea rdi, [rsp+80]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L49:
lea rdi, [rsp+48]
call std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose()
.L50:
mov edi, OFFSET FLAT:GetOctalCode[abi:cxx11](unsigned char)::table
call std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::~vector() [complete object destructor]
.L62:
mov edi, OFFSET FLAT:guard variable for GetOctalCode[abi:cxx11](unsigned char)::table
call __cxa_guard_abort
mov rdi, rbx
call _Unwind_Resume
.LC0:
.byte 92
.byte 0 I have not run benchmarks and I don't know if it is relevant to do so. This being said, without any benchmarking, I expect that that the currrent GetOctalCode is not likely to be optimally efficient. |
@KhafraDev I'm posting a PR to undici fixing the comment momentarily :) |
Undici change: nodejs/undici#2672 |
Good catch though I don't think that's relevant for this PR as it only moved existing code to another C++ file. Worth noting that this is used at build time and the number of times it's run is limited (256) so optimizing it would just mean a few ms/ns faster at build time (not long ago we were even using a 100x slower python script in JS2C...) |
Previously we had two encodings for JS files: 1. If a file contains only ASCII characters, encode it as a one-byte string (interpreted as uint8_t array during loading). 2. If a file contains any characters with code point above 127, encode it as a two-byte string (interpreted as uint16_t array during loading). This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points. Drive-by: replace `’` with `'` in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment.
48a3de6
to
525552a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
very cool. some of this I don't fully grok, but for the most part I get it. thank you!
Landed in 72df124 |
Previously we had two encodings for JS files: 1. If a file contains only ASCII characters, encode it as a one-byte string (interpreted as uint8_t array during loading). 2. If a file contains any characters with code point above 127, encode it as a two-byte string (interpreted as uint16_t array during loading). This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points. Drive-by: replace `’` with `'` in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment. PR-URL: #51605 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Ethan Arrowood <ethan@arrowood.dev>
This is a follow-up to nodejs#51605.
This is a follow-up to nodejs#51605.
This is a follow-up to nodejs#51605.
This is a follow-up to nodejs#51605.
Previously we had two encodings for JS files: 1. If a file contains only ASCII characters, encode it as a one-byte string (interpreted as uint8_t array during loading). 2. If a file contains any characters with code point above 127, encode it as a two-byte string (interpreted as uint16_t array during loading). This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points. Drive-by: replace `’` with `'` in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment. PR-URL: nodejs#51605 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Ethan Arrowood <ethan@arrowood.dev>
Previously we had two encodings for JS files: 1. If a file contains only ASCII characters, encode it as a one-byte string (interpreted as uint8_t array during loading). 2. If a file contains any characters with code point above 127, encode it as a two-byte string (interpreted as uint16_t array during loading). This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points. Drive-by: replace `’` with `'` in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment. PR-URL: #51605 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Ethan Arrowood <ethan@arrowood.dev>
Previously we had two encodings for JS files: 1. If a file contains only ASCII characters, encode it as a one-byte string (interpreted as uint8_t array during loading). 2. If a file contains any characters with code point above 127, encode it as a two-byte string (interpreted as uint16_t array during loading). This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points. Drive-by: replace `’` with `'` in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment. PR-URL: #51605 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Ethan Arrowood <ethan@arrowood.dev>
This is a follow-up to nodejs#51605. PR-URL: nodejs#51818 Reviewed-By: Michaël Zasso <targos@protonmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com> Reviewed-By: Yagiz Nizipli <yagiz.nizipli@sentry.io> Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
* chore: bump node in DEPS to v20.12.0 * chore: update build_add_gn_build_files.patch * chore: update patches * chore: bump node in DEPS to v20.12.1 * chore: update patches * build: encode non-ASCII Latin1 characters as one byte in JS2C nodejs/node#51605 * crypto: use EVP_MD_fetch and cache EVP_MD for hashes nodejs/node#51034 * chore: update filenames.json * chore: bump node in DEPS to v20.12.2 * chore: update patches * src: support configurable snapshot nodejs/node#50453 * test: remove test-domain-error-types flaky designation nodejs/node#51717 * src: avoid draining platform tasks at FreeEnvironment nodejs/node#51290 * chore: fix accidentally deleted v8 dep * lib: define FormData and fetch etc. in the built-in snapshot nodejs/node#51598 * chore: rebase on main * chore: remove stray log --------- Co-authored-by: electron-roller[bot] <84116207+electron-roller[bot]@users.noreply.github.com> Co-authored-by: Cheng <zcbenz@gmail.com> Co-authored-by: Shelley Vohr <shelley.vohr@gmail.com> Co-authored-by: PatchUp <73610968+patchup[bot]@users.noreply.github.com>
* chore: bump node in DEPS to v20.13.1 * chore: bump node in DEPS to v20.14.0 * chore: update build_add_gn_build_files.patch * chore: update patches * chore: update patches * build: encode non-ASCII Latin1 characters as one byte in JS2C nodejs/node#51605 * crypto: use EVP_MD_fetch and cache EVP_MD for hashes nodejs/node#51034 * chore: update filenames.json * chore: update patches * src: support configurable snapshot nodejs/node#50453 * test: remove test-domain-error-types flaky designation nodejs/node#51717 * src: avoid draining platform tasks at FreeEnvironment nodejs/node#51290 * chore: fix accidentally deleted v8 dep * lib: define FormData and fetch etc. in the built-in snapshot nodejs/node#51598 * chore: remove stray log * crypto: enable NODE_EXTRA_CA_CERTS with BoringSSL nodejs/node#52217 * test: skip test for dynamically linked OpenSSL nodejs/node#52542 * lib, url: add a `windows` option to path parsing nodejs/node#52509 * src: use dedicated routine to compile function for builtin CJS loader nodejs/node#52016 * test: mark test as flaky nodejs/node#52671 * build,tools: add test-ubsan ci nodejs/node#46297 * src: preload function for Environment nodejs/node#51539 * deps: update c-ares to 1.28.1 nodejs/node#52285 * chore: fixup * events: extract addAbortListener for safe internal use nodejs/node#52081 * module: print location of unsettled top-level await in entry points nodejs/node#51999 * fs: add stacktrace to fs/promises nodejs/node#49849 * chore: fixup indices --------- Co-authored-by: electron-roller[bot] <84116207+electron-roller[bot]@users.noreply.github.com> Co-authored-by: Cheng <zcbenz@gmail.com> Co-authored-by: Shelley Vohr <shelley.vohr@gmail.com> Co-authored-by: PatchUp <73610968+patchup[bot]@users.noreply.github.com>
Previously we had two encodings for JS files:
This was done because V8 only supports Latin-1 and UTF16 encoding as underlying representation for strings. To store the JS code as external strings to save encoding cost and memory overhead we need to follow the representations supported by V8. Notice that there is a gap in the Latin1 range (128-255) that we encoded as two-byte, which was an undocumented TODO for a long time. That was fine previously because then files that contained code points beyond the 0-127 range contained code points >255. Now we have undici which contains code points in the range 0-255 (minus a replaceable code point >255). So this patch adds handling for the 128-255 range to reduce the size overhead caused by encoding them as two-byte. This could reduce the size of the binary by ~500KB and helps future files with this kind of code points.
Drive-by: replace
’
with'
in undici.js to make it a Latin-1 only string. That could be removed if undici updates itself to replace this character in the comment.