Skip to content

Commit

Permalink
string_decoder: fix handling of malformed utf8
Browse files Browse the repository at this point in the history
There have been problems with utf8 decoding in cases where the input
was invalid.  Some cases would give different results depending on
chunking, while others even led to exceptions.  This commit simplifies
the code in several ways, reducing the risk of breakage.

Most importantly, the `text` method is not only used for bulk
conversion of a single chunk, but also for the conversion of the mini
buffer `lastChar` while handling characters spanning chunks.  That
should make most of the problems due to incorrect handling of special
cases disappear.

Secondly, that `text` method is now independent of encoding.  The
encoding-dependent method `complete` now has a single well-defined
task: determine the buffer position up to which the input consists of
complete characters.  The actual conversion is handled in a central
location, leading to a cleaner and leaner internal interface.

Thirdly, we no longer try to remember just how many bytes we'll need
for the next complete character.  We simply try to fill up the
`nextChar` buffer and perform a conversion on that.  This reduces the
number of internal counter variables from two to one, namely `partial`
which indicates the number of bytes currently stored in `nextChar`.
A possible drawback of this approach is that there is chance of
degraded performance if input arrives one byte at a time and is from a
script using long utf8 sequences.  As a benefit, though, this allows us
to emit a U+FFFD replacement character sooner in cases where the first
byte of an utf8 sequence is not followed by the expected number of
continuation bytes.

Fixes: nodejs#7308
  • Loading branch information
gagern committed Jun 16, 2016
1 parent 1a1ff77 commit 588864d
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 137 deletions.
221 changes: 84 additions & 137 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,186 +44,133 @@ function StringDecoder(encoding) {
var nb;
switch (this.encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
this.complete = utf16Complete;
this.flush = simpleFlush;
// fall through
case 'utf8':
nb = 4;
break;
case 'base64':
this.text = base64Text;
this.end = base64End;
this.complete = base64Complete;
this.flush = simpleFlush;
nb = 3;
break;
default:
this.write = simpleWrite;
this.end = simpleEnd;
return;
}
this.lastNeed = 0;
this.lastTotal = 0;
this.partial = 0;
this.lastChar = Buffer.allocUnsafe(nb);
}

StringDecoder.prototype.write = function(buf) {
if (buf.length === 0)
return '';
var r;
var i;
if (this.lastNeed) {
r = this.fillLast(buf);
if (r === undefined)
return '';
i = this.lastNeed;
this.lastNeed = 0;
} else {
i = 0;
}
if (i < buf.length)
return (r ? r + this.text(buf, i) : this.text(buf, i));
return r || '';
const partial = this.partial;
if (!partial)
return this.text(buf, 0, buf.length);

// We have incomplete characters in partial many bytes from last run.
// Copy bytes from buf to fill lastChar (if there is enough input).
const newHeadLen = Math.min(buf.length, this.lastChar.length - partial);
const totalHeadLen = newHeadLen + partial;
buf.copy(this.lastChar, partial, 0, newHeadLen);
// Now we have totalHeadLen bytes of input in lastChar, try to convert that.
let r = this.text(this.lastChar, 0, totalHeadLen);
if (this.partial <= newHeadLen) // consumed at least all the old head
r += this.text(buf, newHeadLen - this.partial, buf.length);
return r;
};

StringDecoder.prototype.end = utf8End;

// Returns only complete characters in a Buffer
StringDecoder.prototype.text = utf8Text;
StringDecoder.prototype.text = function(buf, start, end) {
if (start === end)
return '';
const complete = this.complete(buf, start, end);
this.partial = end - complete;
if (this.partial && buf !== this.lastChar)
buf.copy(this.lastChar, 0, complete, end);
if (start === complete)
return '';
return buf.toString(this.encoding, start, complete);
};

// Attempts to complete a partial character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
// Returns a suitable representation of incomplete characters as well
StringDecoder.prototype.end = function(buf) {
let r = (buf && buf.length ? this.write(buf) : '');
if (this.partial) {
r += this.flush();
this.partial = 0;
}
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
this.lastNeed -= buf.length;
return r;
};

// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
// continuation byte.
function utf8CheckByte(byte) {
if (byte <= 0x7F)
return 0;
else if (byte >> 5 === 0x06)
return 2;
else if (byte >> 4 === 0x0E)
return 3;
else if (byte >> 3 === 0x1E)
return 4;
return -1;
}
// Given (buf, start, end), determine the maximal n <= end such that
// buf.slice(start, n) contains only complete characters
StringDecoder.prototype.complete = utf8Complete;

// Returns a string representation of the this.partial bytes in
// this.lastChar which represent an incomplete character
StringDecoder.prototype.flush = utf8Flush;

// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
// character, returning the total number of bytes needed to complete the partial
// character (if applicable).
function utf8CheckIncomplete(self, buf, i) {
var j = buf.length - 1;
if (j < i)
return 0;
var nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
}
if (j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
}
if (j < i)
return 0;
nb = utf8CheckByte(buf[j--]);
if (nb >= 0) {
if (nb > 0)
self.lastNeed = nb + 1 - (buf.length - j);
return nb;
// character, returning the position after the last complete character.
function utf8Complete(buf, start, end) {
if (start > end - 3)
start = end - 3;
for (let i = end - 1; i >= start; --i) {
const byte = buf[i];
let numBytes;
if (byte >> 6 === 0x02)
continue; // continuation byte
else if (byte >> 5 === 0x06)
numBytes = 2;
else if (byte >> 4 === 0x0E)
numBytes = 3;
else if (byte >> 3 === 0x1E)
numBytes = 4;
else
numBytes = 1; // ASCII or invalid
if (i + numBytes > end) // incomplete
return i; // continue next run at leading byte
// Have complete sequence, possibly followed by garbage continuation.
return end;
}
return 0;
}

// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
// partial character, the character's bytes are buffered until the required
// number of bytes are available.
function utf8Text(buf, i) {
const total = utf8CheckIncomplete(this, buf, i);
if (!this.lastNeed)
return buf.toString('utf8', i);
this.lastTotal = total;
const end = buf.length - (total - this.lastNeed);
buf.copy(this.lastChar, 0, end);
return buf.toString('utf8', i, end);
// Ends in valid 4-byte sequence or invalid continuation characters.
// Either way the input is complete, so convert it as is.
return end;
}

// For UTF-8, a replacement character for each buffered byte of a (partial)
// character needs to be added to the output.
function utf8End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed);
return r;
function utf8Flush() {
return '\ufffd'.repeat(this.partial);
}

// UTF-16LE typically needs two bytes per character, but even if we have an even
// number of bytes available, we need to check if we end on a leading/high
// surrogate. In that case, we need to wait for the next two bytes in order to
// decode the last character properly.
function utf16Text(buf, i) {
if ((buf.length - i) % 2 === 0) {
const r = buf.toString('utf16le', i);
if (r) {
const c = r.charCodeAt(r.length - 1);
if (c >= 0xD800 && c <= 0xDBFF) {
this.lastNeed = 2;
this.lastTotal = 4;
this.lastChar[0] = buf[buf.length - 2];
this.lastChar[1] = buf[buf.length - 1];
return r.slice(0, -1);
}
}
return r;
function utf16Complete(buf, start, end) {
if ((end - start) & 1)
--end;
if (end > start) {
const byte = buf[end - 1];
if (byte >= 0xD8 && byte <= 0xDB)
return end - 2;
}
this.lastNeed = 1;
this.lastTotal = 2;
this.lastChar[0] = buf[buf.length - 1];
return buf.toString('utf16le', i, buf.length - 1);
return end;
}

// For UTF-16LE we do not explicitly append special replacement characters if we
// end on a partial character, we simply let v8 handle that.
function utf16End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed) {
const end = this.lastTotal - this.lastNeed;
return r + this.lastChar.toString('utf16le', 0, end);
}
return r;
function base64Complete(buf, start, end) {
return end - (end - start) % 3;
}

function base64Text(buf, i) {
const n = (buf.length - i) % 3;
if (n === 0)
return buf.toString('base64', i);
this.lastNeed = 3 - n;
this.lastTotal = 3;
if (n === 1) {
this.lastChar[0] = buf[buf.length - 1];
} else {
this.lastChar[0] = buf[buf.length - 2];
this.lastChar[1] = buf[buf.length - 1];
}
return buf.toString('base64', i, buf.length - n);
}


function base64End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
return r;
// For UTF-16LE and Base64 we do not explicitly append special replacement
// characters if we end on a partial character, we simply let v8 handle that.
function simpleFlush() {
return this.lastChar.toString(this.encoding, 0, this.partial);
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
Expand Down
30 changes: 30 additions & 0 deletions test/parallel/test-string-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,30 @@ test(
'\u02e4\u0064\u12e4\u0030\u3045'
);

// Some invalid input, known to have caused trouble with chunking
// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923
// 00: |00000000 ASCII
// 41: |01000001 ASCII
// B8: 10|111000 continuation
// CC: 110|01100 two-byte head
// E2: 1110|0010 three-byte head
// F0: 11110|000 four-byte head
// F1: 11110|001'another four-byte head
// FB: 111110|11 "five-byte head", not UTF-8
test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA');
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA');
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338');
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd');
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');

// UCS-2
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');

Expand Down Expand Up @@ -58,6 +82,11 @@ decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
assert.strictEqual(decoder.end(), '\ufffd');

decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.write(Buffer.from('f1', 'hex')), '');
assert.strictEqual(decoder.write(Buffer.from('41f2', 'hex')), '\ufffdA');
assert.strictEqual(decoder.end(), '\ufffd');


// Additional UTF-16LE surrogate pair tests
decoder = new StringDecoder('utf16le');
Expand Down Expand Up @@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) {
sequence.forEach(function(write) {
output += decoder.write(input.slice(write[0], write[1]));
});
output += decoder.end();
process.stdout.write('.');
if (output !== expected) {
var message =
Expand Down

0 comments on commit 588864d

Please sign in to comment.