Skip to content

Commit 916cd32

Browse files
committed
lib: use utf8 fast path for streaming TextDecoder
1 parent e155415 commit 916cd32

File tree

3 files changed

+112
-43
lines changed

3 files changed

+112
-43
lines changed

lib/internal/encoding.js

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,13 @@ const kHandle = Symbol('handle');
2828
const kFlags = Symbol('flags');
2929
const kEncoding = Symbol('encoding');
3030
const kDecoder = Symbol('decoder');
31+
const kChunk = Symbol('chunk');
3132
const kFatal = Symbol('kFatal');
3233
const kUTF8FastPath = Symbol('kUTF8FastPath');
3334
const kIgnoreBOM = Symbol('kIgnoreBOM');
3435

3536
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
37+
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
3638

3739
const {
3840
getConstructorOf,
@@ -447,6 +449,7 @@ class TextDecoder {
447449
this[kUTF8FastPath] = false;
448450
this[kHandle] = undefined;
449451
this[kSingleByte] = undefined; // Does not care about streaming or BOM
452+
this[kChunk] = null; // A copy of previous streaming tail or null
450453

451454
if (enc === 'utf-8') {
452455
this[kUTF8FastPath] = true;
@@ -483,8 +486,48 @@ class TextDecoder {
483486

484487
const stream = options?.stream;
485488
if (this[kUTF8FastPath]) {
486-
if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
487-
this[kUTF8FastPath] = false;
489+
const chunk = this[kChunk];
490+
let ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
491+
if (!stream) {
492+
this[kBOMSeen] = false;
493+
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
494+
}
495+
496+
let u = parseInput(input);
497+
if (u.length === 0 && stream) return '' // no state change
498+
let prefix
499+
if (chunk) {
500+
const merged = mergePrefixUtf8(u, this[kChunk])
501+
if (u.length < 3) {
502+
u = merged; // might be unfinished, but fully consumed old u
503+
} else {
504+
prefix = merged // stops at complete chunk
505+
const add = prefix.length - this[kChunk].length
506+
if (add > 0) u = u.subarray(add)
507+
}
508+
509+
this[kChunk] = null;
510+
}
511+
512+
if (stream) {
513+
const trail = unfinishedBytesUtf8(u, u.length)
514+
if (trail > 0) {
515+
this[kChunk] = new FastBuffer(u.subarray(-trail)) // copy
516+
if (!prefix && trail === u.length) return '' // no further state change
517+
u = u.subarray(0, -trail)
518+
}
519+
}
520+
521+
try {
522+
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') + decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
523+
// "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
524+
// We don't get here if we had no complete data to process, and we don't want BOM processing after that if streaming
525+
if (stream) this[kBOMSeen] = true
526+
} catch (e) {
527+
this[kChunk] = null // reset unfinished chunk on errors
528+
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
529+
throw e
530+
}
488531
}
489532

490533
this.#prepareConverter();

lib/internal/encoding/util.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// From https://npmjs.com/package/@exodus/bytes
// Copyright Exodus Movement. Licensed under MIT License.

'use strict';

// Returns the number of trailing bytes of the Uint8Array `u`, ending at offset
// `len`, that do not yet form a complete UTF-8 codepoint but could still become
// one if more data arrives (i.e. a truncated multi-byte sequence). Result is
// 0-3. Sequences that can never become valid (bad lead byte, out-of-range
// continuation, overlong form, surrogate, > U+10FFFF) return 0 so they are
// decoded — and rejected — immediately instead of being buffered.
function unfinishedBytesUtf8(u, len) {
  // Walk back over at most 2 continuation bytes (0b10xxxxxx): a truncated
  // sequence holds at most a lead plus 2 continuations (4-byte max minus one).
  let p = 0;
  while (p < 2 && p < len && (u[len - p - 1] & 0xc0) === 0x80) p++;
  if (p === len) return 0; // No room left for a lead byte.
  const l = u[len - p - 1];
  // 0xc0/0xc1 would be overlong 2-byte forms; > 0xf4 exceeds U+10FFFF.
  if (l < 0xc2 || l > 0xf4) return 0; // Not a valid lead byte.
  if (p === 0) return 1; // Bare lead: always unfinished. 2-byte leads end here.
  // A 2-byte lead followed by a continuation, or a <= 3-byte lead followed by
  // 2 continuations, is already a complete sequence — nothing unfinished.
  if (l < 0xe0 || (l < 0xf0 && p >= 2)) return 0;
  // Validate the first continuation against the lead-specific range that rules
  // out overlong forms (0xe0 / 0xf0), surrogates (0xed) and > U+10FFFF (0xf4).
  const lower = l === 0xf0 ? 0x90 : l === 0xe0 ? 0xa0 : 0x80;
  const upper = l === 0xf4 ? 0x8f : l === 0xed ? 0x9f : 0xbf;
  const n = u[len - p];
  return n >= lower && n <= upper ? p + 1 : 0;
}
23+
// Merge the buffered prefix `chunk` (1-3 unfinished bytes kept from the
// previous streaming call) with the new input `u`, and return the combined
// prefix to decode separately from the rest of `u`.
// For u.length < 3 the whole of `u` is consumed into the returned prefix,
// which may itself still be unfinished; otherwise the returned prefix ends on
// a codepoint boundary and the caller must skip the first
// `result.length - chunk.length` bytes of `u` before decoding the remainder.
function mergePrefixUtf8(u, chunk) {
  if (u.length === 0) return chunk;
  if (u.length < 3) {
    // Too little data to be sure the sequence completes — just concatenate.
    // No reason to brute-force offsets here.
    const a = new Uint8Array(u.length + chunk.length);
    a.set(chunk);
    a.set(u, chunk.length);
    return a;
  }

  // Copy at most 3 bytes of `u` after `chunk`: an unfinished prefix of 1-3
  // bytes needs at most 3 more bytes to complete a 4-byte codepoint, so this
  // avoids extending to the full input size.
  const t = new Uint8Array(chunk.length + 3);
  t.set(chunk);
  t.set(u.subarray(0, 3), chunk.length);

  // Stop at the first offset where the merged data ends on a codepoint
  // boundary (unfinished === 0), or where the unfinished tail fits entirely
  // inside `u` (unfinished <= i), so no bytes of `chunk` remain pending.
  for (let i = 1; i <= 3; i++) {
    const unfinished = unfinishedBytesUtf8(t, chunk.length + i); // 0-3
    if (unfinished <= i) {
      const add = i - unfinished; // 0-3 bytes of `u` appended to the prefix
      return add > 0 ? t.subarray(0, chunk.length + add) : chunk;
    }
  }

  // Unreachable: at i === 3 the tail is at most 3 bytes, so unfinished <= i.
}
54+
55+
// Internal UTF-8 streaming helpers used by lib/internal/encoding.js.
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };

test/parallel/test-whatwg-encoding-custom-textdecoder.js

Lines changed: 12 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,8 @@ assert(TextDecoder);
8080

8181
['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
8282
const dec = new TextDecoder(i, { fatal: true });
83-
if (common.hasIntl) {
84-
dec.decode(buf.slice(0, 8), { stream: true });
85-
dec.decode(buf.slice(8));
86-
} else {
87-
assert.throws(
88-
() => {
89-
dec.decode(buf.slice(0, 8), { stream: true });
90-
},
91-
{
92-
code: 'ERR_NO_ICU',
93-
name: 'TypeError',
94-
message: '"fatal" option is not supported on Node.js compiled without ICU'
95-
});
96-
}
83+
dec.decode(buf.slice(0, 8), { stream: true });
84+
dec.decode(buf.slice(8));
9785
});
9886

9987
// Test TextDecoder, label undefined, options null
@@ -122,33 +110,16 @@ if (common.hasIntl) {
122110
// Test TextDecoder inspect with hidden fields
123111
{
124112
const dec = new TextDecoder('utf-8', { ignoreBOM: true });
125-
if (common.hasIntl) {
126-
assert.strictEqual(
127-
util.inspect(dec, { showHidden: true }),
128-
'TextDecoder {\n' +
129-
' encoding: \'utf-8\',\n' +
130-
' fatal: false,\n' +
131-
' ignoreBOM: true,\n' +
132-
' Symbol(flags): 4,\n' +
133-
' Symbol(handle): undefined\n' +
134-
'}'
135-
);
136-
} else {
137-
dec.decode(Uint8Array.of(0), { stream: true });
138-
assert.strictEqual(
139-
util.inspect(dec, { showHidden: true }),
140-
'TextDecoder {\n' +
141-
" encoding: 'utf-8',\n" +
142-
' fatal: false,\n' +
143-
' ignoreBOM: true,\n' +
144-
' Symbol(flags): 4,\n' +
145-
' Symbol(handle): StringDecoder {\n' +
146-
" encoding: 'utf8',\n" +
147-
' Symbol(kNativeDecoder): <Buffer 00 00 00 00 00 00 01>\n' +
148-
' }\n' +
149-
'}'
150-
);
151-
}
113+
assert.strictEqual(
114+
util.inspect(dec, { showHidden: true }),
115+
'TextDecoder {\n' +
116+
' encoding: \'utf-8\',\n' +
117+
' fatal: false,\n' +
118+
' ignoreBOM: true,\n' +
119+
' Symbol(flags): 4,\n' +
120+
' Symbol(handle): undefined\n' +
121+
'}'
122+
);
152123
}
153124

154125

0 commit comments

Comments
 (0)