diff --git a/src/string_decoder.cc b/src/string_decoder.cc index fa8201faff2944..cc38cd927a0f89 100644 --- a/src/string_decoder.cc +++ b/src/string_decoder.cc @@ -71,16 +71,17 @@ MaybeLocal StringDecoder::DecodeData(Isolate* isolate, kIncompleteCharactersEnd); if (Encoding() == UTF8) { // For UTF-8, we need special treatment to align with the V8 decoder: - // If an incomplete character is found at a chunk boundary, we turn - // that character into a single invalid one. + // If an incomplete character is found at a chunk boundary, we use + // its remainder and pass it to V8 as-is. for (size_t i = 0; i < nread && i < MissingBytes(); ++i) { if ((data[i] & 0xC0) != 0x80) { // This byte is not a continuation byte even though it should have - // been one. - // Act as if there was a 1-byte incomplete character, which does - // not make sense but works here because we know it's invalid. + // been one. We stop decoding of the incomplete character at this + // point (but still use the rest of the incomplete bytes from this + // chunk) and assume that the new, unexpected byte starts a new one. state_[kMissingBytes] = 0; - state_[kBufferedBytes] = 1; + memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i); + state_[kBufferedBytes] += i; data += i; nread -= i; break; diff --git a/test/parallel/test-string-decoder-fuzz.js b/test/parallel/test-string-decoder-fuzz.js new file mode 100644 index 00000000000000..d8d01881591161 --- /dev/null +++ b/test/parallel/test-string-decoder-fuzz.js @@ -0,0 +1,48 @@ +'use strict'; +require('../common'); +const { StringDecoder } = require('string_decoder'); +const util = require('util'); +const assert = require('assert'); + +// Tests that, for random sequences of bytes, our StringDecoder gives the +// same result as a direction conversion using Buffer.toString(). +// In particular, it checks that StringDecoder aligns with V8’s own output. + +function rand(max) { + return Math.floor(Math.random() * max); +} + +function randBuf(maxLen) { + const buf = Buffer.allocUnsafe(rand(maxLen)); + for (let i = 0; i < buf.length; i++) + buf[i] = rand(256); + return buf; +} + +const encodings = [ + 'utf16le', 'utf8', 'ascii', 'hex', 'base64', 'latin1' +]; + +function runSingleFuzzTest() { + const enc = encodings[rand(encodings.length)]; + const sd = new StringDecoder(enc); + const bufs = []; + const strings = []; + + const N = rand(10); + for (let i = 0; i < N; ++i) { + const buf = randBuf(50); + bufs.push(buf); + strings.push(sd.write(buf)); + } + strings.push(sd.end()); + + assert.strictEqual(strings.join(''), Buffer.concat(bufs).toString(enc), + `Mismatch:\n${util.inspect(strings)}\n` + + util.inspect(bufs.map((buf) => buf.toString('hex'))) + + `\nfor encoding ${enc}`); +} + +const start = Date.now(); +while (Date.now() - start < 100) + runSingleFuzzTest(); diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index 6e4f4b121d20d7..56894289b17612 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -151,6 +151,17 @@ assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10)); assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24)); assert.strictEqual(decoder.end(), ''); +// Regression tests for https://github.com/nodejs/node/issues/22626 +// (not enough replacement chars when having seen more than one byte of an +// incomplete multibyte characters). +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('f69b', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('d1', 'hex')), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); +assert.strictEqual(decoder.write(Buffer.from('f4', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('bde5', 'hex')), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); + common.expectsError( () => new StringDecoder(1), {