nodejs · addaleax · Sep 5, 2018 · Sep 5, 2018
diff --git a/src/string_decoder.cc b/src/string_decoder.cc
@@ -71,16 +71,17 @@ MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
                kIncompleteCharactersEnd);
       if (Encoding() == UTF8) {
         // For UTF-8, we need special treatment to align with the V8 decoder:
-        // If an incomplete character is found at a chunk boundary, we turn
-        // that character into a single invalid one.
+        // If an incomplete character is found at a chunk boundary, we use
+        // its remainder and pass it to V8 as-is.
         for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
           if ((data[i] & 0xC0) != 0x80) {
             // This byte is not a continuation byte even though it should have
-            // been one.
-            // Act as if there was a 1-byte incomplete character, which does
-            // not make sense but works here because we know it's invalid.
+            // been one. We stop decoding of the incomplete character at this
+            // point (but still use the rest of the incomplete bytes from this
+            // chunk) and assume that the new, unexpected byte starts a new one.
             state_[kMissingBytes] = 0;
-            state_[kBufferedBytes] = 1;
+            memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
+            state_[kBufferedBytes] += i;
             data += i;
             nread -= i;
             break;

diff --git a/test/parallel/test-string-decoder-fuzz.js b/test/parallel/test-string-decoder-fuzz.js
@@ -0,0 +1,48 @@
+'use strict';
+require('../common');
+const { StringDecoder } = require('string_decoder');
+const util = require('util');
+const assert = require('assert');
+
+// Tests that, for random sequences of bytes, our StringDecoder gives the
+// same result as a direction conversion using Buffer.toString().
+// In particular, it checks that StringDecoder aligns with V8’s own output.
+
+function rand(max) {
+  return Math.floor(Math.random() * max);
+}
+
+function randBuf(maxLen) {
+  const buf = Buffer.allocUnsafe(rand(maxLen));
+  for (let i = 0; i < buf.length; i++)
+    buf[i] = rand(256);
+  return buf;
+}
+
+const encodings = [
+  'utf16le', 'utf8', 'ascii', 'hex', 'base64', 'latin1'
+];
+
+function runSingleFuzzTest() {
+  const enc = encodings[rand(encodings.length)];
+  const sd = new StringDecoder(enc);
+  const bufs = [];
+  const strings = [];
+
+  const N = rand(10);
+  for (let i = 0; i < N; ++i) {
+    const buf = randBuf(50);
+    bufs.push(buf);
+    strings.push(sd.write(buf));
+  }
+  strings.push(sd.end());
+
+  assert.strictEqual(strings.join(''), Buffer.concat(bufs).toString(enc),
+                     `Mismatch:\n${util.inspect(strings)}\n` +
+                     util.inspect(bufs.map((buf) => buf.toString('hex'))) +
+                     `\nfor encoding ${enc}`);
+}
+
+const start = Date.now();
+while (Date.now() - start < 100)
+  runSingleFuzzTest();
diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js
@@ -151,6 +151,17 @@ assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10));
 assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24));
 assert.strictEqual(decoder.end(), '');
 
+// Regression tests for https://github.com/nodejs/node/issues/22626
+// (not enough replacement chars when having seen more than one byte of an
+// incomplete multibyte characters).
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f69b', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('d1', 'hex')), '\ufffd\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
+assert.strictEqual(decoder.write(Buffer.from('f4', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('bde5', 'hex')), '\ufffd\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 common.expectsError(
   () => new StringDecoder(1),
   {