Skip to content

Commit 916cd32

Browse files
committed
lib: use utf8 fast path for streaming TextDecoder
1 parent e155415 commit 916cd32

File tree

3 files changed

+112
-43
lines changed

3 files changed

+112
-43
lines changed

lib/internal/encoding.js

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,13 @@ const kHandle = Symbol('handle');
2828
const kFlags = Symbol('flags');
2929
const kEncoding = Symbol('encoding');
3030
const kDecoder = Symbol('decoder');
31+
const kChunk = Symbol('chunk');
3132
const kFatal = Symbol('kFatal');
3233
const kUTF8FastPath = Symbol('kUTF8FastPath');
3334
const kIgnoreBOM = Symbol('kIgnoreBOM');
3435

3536
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
37+
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
3638

3739
const {
3840
getConstructorOf,
@@ -447,6 +449,7 @@ class TextDecoder {
447449
this[kUTF8FastPath] = false;
448450
this[kHandle] = undefined;
449451
this[kSingleByte] = undefined; // Does not care about streaming or BOM
452+
this[kChunk] = null; // A copy of previous streaming tail or null
450453

451454
if (enc === 'utf-8') {
452455
this[kUTF8FastPath] = true;
@@ -483,8 +486,48 @@ class TextDecoder {
483486

484487
const stream = options?.stream;
485488
if (this[kUTF8FastPath]) {
486-
if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
487-
this[kUTF8FastPath] = false;
489+
const chunk = this[kChunk];
490+
let ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
491+
if (!stream) {
492+
this[kBOMSeen] = false;
493+
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
494+
}
495+
496+
let u = parseInput(input);
497+
if (u.length === 0 && stream) return '' // no state change
498+
let prefix
499+
if (chunk) {
500+
const merged = mergePrefixUtf8(u, this[kChunk])
501+
if (u.length < 3) {
502+
u = merged; // might be unfinished, but fully consumed old u
503+
} else {
504+
prefix = merged // stops at complete chunk
505+
const add = prefix.length - this[kChunk].length
506+
if (add > 0) u = u.subarray(add)
507+
}
508+
509+
this[kChunk] = null;
510+
}
511+
512+
if (stream) {
513+
const trail = unfinishedBytesUtf8(u, u.length)
514+
if (trail > 0) {
515+
this[kChunk] = new FastBuffer(u.subarray(-trail)) // copy
516+
if (!prefix && trail === u.length) return '' // no further state change
517+
u = u.subarray(0, -trail)
518+
}
519+
}
520+
521+
try {
522+
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') + decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
523+
// "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
524+
// We don't get here if we had no complete data to process, and we don't want BOM processing after that if streaming
525+
if (stream) this[kBOMSeen] = true
526+
} catch (e) {
527+
this[kChunk] = null // reset unfinished chunk on errors
528+
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
529+
throw e
530+
}
488531
}
489532

490533
this.#prepareConverter();

lib/internal/encoding/util.js

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// From https://npmjs.com/package/@exodus/bytes
// Copyright Exodus Movement. Licensed under MIT License.

'use strict';

// Returns the number of trailing bytes of the Uint8Array `u`, ending at offset
// `len`, that do not yet form a complete UTF-8 codepoint but could still become
// one if more data arrives (i.e. a truncated multi-byte sequence). Result is
// 0-3. Sequences that can never become valid (bad lead byte, out-of-range
// continuation, overlong form, surrogate, > U+10FFFF) return 0 so they are
// decoded — and rejected — immediately instead of being buffered.
function unfinishedBytesUtf8(u, len) {
  // Walk back over at most 2 continuation bytes (0b10xxxxxx): a truncated
  // sequence holds at most a lead plus 2 continuations (4-byte max minus one).
  let p = 0;
  while (p < 2 && p < len && (u[len - p - 1] & 0xc0) === 0x80) p++;
  if (p === len) return 0; // No room left for a lead byte.
  const l = u[len - p - 1];
  // 0xc0/0xc1 would be overlong 2-byte forms; > 0xf4 exceeds U+10FFFF.
  if (l < 0xc2 || l > 0xf4) return 0; // Not a valid lead byte.
  if (p === 0) return 1; // Bare lead: always unfinished. 2-byte leads end here.
  // A 2-byte lead followed by a continuation, or a <= 3-byte lead followed by
  // 2 continuations, is already a complete sequence — nothing unfinished.
  if (l < 0xe0 || (l < 0xf0 && p >= 2)) return 0;
  // Validate the first continuation against the lead-specific range that rules
  // out overlong forms (0xe0 / 0xf0), surrogates (0xed) and > U+10FFFF (0xf4).
  const lower = l === 0xf0 ? 0x90 : l === 0xe0 ? 0xa0 : 0x80;
  const upper = l === 0xf4 ? 0x8f : l === 0xed ? 0x9f : 0xbf;
  const n = u[len - p];
  return n >= lower && n <= upper ? p + 1 : 0;
}
23+
// Merge the buffered prefix `chunk` (1-3 unfinished bytes kept from the
// previous streaming call) with the new input `u`, and return the combined
// prefix to decode separately from the rest of `u`.
// For u.length < 3 the whole of `u` is consumed into the returned prefix,
// which may itself still be unfinished; otherwise the returned prefix ends on
// a codepoint boundary and the caller must skip the first
// `result.length - chunk.length` bytes of `u` before decoding the remainder.
function mergePrefixUtf8(u, chunk) {
  if (u.length === 0) return chunk;
  if (u.length < 3) {
    // Too little data to be sure the sequence completes — just concatenate.
    // No reason to brute-force offsets here.
    const a = new Uint8Array(u.length + chunk.length);
    a.set(chunk);
    a.set(u, chunk.length);
    return a;
  }

  // Copy at most 3 bytes of `u` after `chunk`: an unfinished prefix of 1-3
  // bytes needs at most 3 more bytes to complete a 4-byte codepoint, so this
  // avoids extending to the full input size.
  const t = new Uint8Array(chunk.length + 3);
  t.set(chunk);
  t.set(u.subarray(0, 3), chunk.length);

  // Stop at the first offset where the merged data ends on a codepoint
  // boundary (unfinished === 0), or where the unfinished tail fits entirely
  // inside `u` (unfinished <= i), so no bytes of `chunk` remain pending.
  for (let i = 1; i <= 3; i++) {
    const unfinished = unfinishedBytesUtf8(t, chunk.length + i); // 0-3
    if (unfinished <= i) {
      const add = i - unfinished; // 0-3 bytes of `u` appended to the prefix
      return add > 0 ? t.subarray(0, chunk.length + add) : chunk;
    }
  }

  // Unreachable: at i === 3 the tail is at most 3 bytes, so unfinished <= i.
}
54+
55+
// Internal UTF-8 streaming helpers used by lib/internal/encoding.js.
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };

test/parallel/test-whatwg-encoding-custom-textdecoder.js

Lines changed: 12 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,8 @@ assert(TextDecoder);
8080

8181
['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
8282
const dec = new TextDecoder(i, { fatal: true });
83-
if (common.hasIntl) {
84-
dec.decode(buf.slice(0, 8), { stream: true });
85-
dec.decode(buf.slice(8));
86-
} else {
87-
assert.throws(
88-
() => {
89-
dec.decode(buf.slice(0, 8), { stream: true });
90-
},
91-
{
92-
code: 'ERR_NO_ICU',
93-
name: 'TypeError',
94-
message: '"fatal" option is not supported on Node.js compiled without ICU'
95-
});
96-
}
83+
dec.decode(buf.slice(0, 8), { stream: true });
84+
dec.decode(buf.slice(8));
9785
});
9886

9987
// Test TextDecoder, label undefined, options null
@@ -122,33 +110,16 @@ if (common.hasIntl) {
122110
// Test TextDecoder inspect with hidden fields
123111
{
124112
const dec = new TextDecoder('utf-8', { ignoreBOM: true });
125-
if (common.hasIntl) {
126-
assert.strictEqual(
127-
util.inspect(dec, { showHidden: true }),
128-
'TextDecoder {\n' +
129-
' encoding: \'utf-8\',\n' +
130-
' fatal: false,\n' +
131-
' ignoreBOM: true,\n' +
132-
' Symbol(flags): 4,\n' +
133-
' Symbol(handle): undefined\n' +
134-
'}'
135-
);
136-
} else {
137-
dec.decode(Uint8Array.of(0), { stream: true });
138-
assert.strictEqual(
139-
util.inspect(dec, { showHidden: true }),
140-
'TextDecoder {\n' +
141-
" encoding: 'utf-8',\n" +
142-
' fatal: false,\n' +
143-
' ignoreBOM: true,\n' +
144-
' Symbol(flags): 4,\n' +
145-
' Symbol(handle): StringDecoder {\n' +
146-
" encoding: 'utf8',\n" +
147-
' Symbol(kNativeDecoder): <Buffer 00 00 00 00 00 00 01>\n' +
148-
' }\n' +
149-
'}'
150-
);
151-
}
113+
assert.strictEqual(
114+
util.inspect(dec, { showHidden: true }),
115+
'TextDecoder {\n' +
116+
' encoding: \'utf-8\',\n' +
117+
' fatal: false,\n' +
118+
' ignoreBOM: true,\n' +
119+
' Symbol(flags): 4,\n' +
120+
' Symbol(handle): undefined\n' +
121+
'}'
122+
);
152123
}
153124

154125

0 commit comments

Comments
 (0)