Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(NODE-5909): optimize writing basic latin strings #645

Merged
merged 3 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/utils/byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ export type ByteUtils = {
fromHex: (hex: string) => Uint8Array;
/** Create a lowercase hex string from bytes */
toHex: (buffer: Uint8Array) => string;
/** Create a Uint8Array containing utf8 code units from a string */
fromUTF8: (text: string) => Uint8Array;
/** Create a string from utf8 code units, fatal=true will throw an error if UTF-8 bytes are invalid, fatal=false will insert replacement characters */
toUTF8: (buffer: Uint8Array, start: number, end: number, fatal: boolean) => string;
/** Get the utf8 code unit count from a string if it were to be transformed to utf8 */
Expand Down
45 changes: 44 additions & 1 deletion src/utils/latin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
* @param end - The index to stop searching the uint8array
* @returns string if all bytes are within the basic latin range, otherwise null
*/
export function tryLatin(uint8array: Uint8Array, start: number, end: number): string | null {
export function tryReadBasicLatin(
uint8array: Uint8Array,
start: number,
end: number
): string | null {
if (uint8array.length === 0) {
return '';
}
Expand Down Expand Up @@ -59,3 +63,42 @@ export function tryLatin(uint8array: Uint8Array, start: number, end: number): st

return String.fromCharCode(...latinBytes);
}

/**
 * Fast path for serializing short strings made up only of basic latin
 * (7-bit ASCII) code units directly into a byte array.
 * @internal
 * @remarks
 * ### Important characteristics:
 * - An empty string returns 0 immediately and leaves the destination untouched
 * - A string longer than 25 code units returns null
 * - A string that does not fit in the space between offset and the end of the destination returns null
 * - Any code unit greater than 127 makes this function return null; code units that
 *   preceded it have already been copied into the destination at that point
 *
 * @param destination - The uint8array that receives the bytes
 * @param source - The string to copy, one byte per code unit, when every code unit is at most 127
 * @param offset - The index in destination where the first byte is written
 * @returns the number of bytes written (equal to the string length) when every code unit is at most 127, otherwise null
 */
export function tryWriteBasicLatin(
  destination: Uint8Array,
  source: string,
  offset: number
): number | null {
  const codeUnitCount = source.length;

  if (codeUnitCount === 0) return 0;

  // Longer strings are left for the caller to handle.
  if (codeUnitCount > 25) return null;

  // Each basic latin code unit takes exactly one byte, so the string length is the space required.
  const bytesAvailable = destination.length - offset;
  if (bytesAvailable < codeUnitCount) return null;

  let readIndex = 0;
  let writeIndex = offset;
  while (readIndex < codeUnitCount) {
    const codeUnit = source.charCodeAt(readIndex);
    // Anything above 127 is outside the basic latin range and cannot be copied as a single byte.
    if (codeUnit > 127) return null;

    destination[writeIndex] = codeUnit;
    readIndex++;
    writeIndex++;
  }

  return codeUnitCount;
}
13 changes: 7 additions & 6 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { tryLatin } from './latin';
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
type NodeJsBuffer = ArrayBufferView &
Expand Down Expand Up @@ -123,12 +123,8 @@ export const nodeJsByteUtils = {
return nodeJsByteUtils.toLocalBufferType(buffer).toString('hex');
},

fromUTF8(text: string): NodeJsBuffer {
return Buffer.from(text, 'utf8');
},

toUTF8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
const basicLatin = end - start <= 20 ? tryLatin(buffer, start, end) : null;
const basicLatin = end - start <= 20 ? tryReadBasicLatin(buffer, start, end) : null;
if (basicLatin != null) {
return basicLatin;
}
Expand All @@ -153,6 +149,11 @@ export const nodeJsByteUtils = {
},

/**
 * Writes `source` into `buffer` starting at `byteOffset` and returns the number of bytes written.
 * The basic latin fast path is attempted first; when it declines (returns null) the string is
 * written through the `write(..., 'utf8')` method of the local buffer type instead.
 */
encodeUTF8Into(buffer: Uint8Array, source: string, byteOffset: number): number {
  const fastPathByteCount = tryWriteBasicLatin(buffer, source, byteOffset);

  if (fastPathByteCount == null) {
    // The fast path declined, so fall back to the general utf8 write.
    return nodeJsByteUtils.toLocalBufferType(buffer).write(source, byteOffset, undefined, 'utf8');
  }

  return fastPathByteCount;
},

Expand Down
12 changes: 4 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { BSONError } from '../error';
import { tryLatin } from './latin';
import { tryReadBasicLatin } from './latin';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -169,12 +169,8 @@ export const webByteUtils = {
return Array.from(uint8array, byte => byte.toString(16).padStart(2, '0')).join('');
},

fromUTF8(text: string): Uint8Array {
return new TextEncoder().encode(text);
},

toUTF8(uint8array: Uint8Array, start: number, end: number, fatal: boolean): string {
const basicLatin = end - start <= 20 ? tryLatin(uint8array, start, end) : null;
const basicLatin = end - start <= 20 ? tryReadBasicLatin(uint8array, start, end) : null;
if (basicLatin != null) {
return basicLatin;
}
Expand All @@ -190,11 +186,11 @@ export const webByteUtils = {
},

utf8ByteLength(input: string): number {
return webByteUtils.fromUTF8(input).byteLength;
return new TextEncoder().encode(input).byteLength;
},

encodeUTF8Into(buffer: Uint8Array, source: string, byteOffset: number): number {
const bytes = webByteUtils.fromUTF8(source);
const bytes = new TextEncoder().encode(source);
buffer.set(bytes, byteOffset);
return bytes.byteLength;
},
Expand Down
32 changes: 17 additions & 15 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -365,33 +365,35 @@ const toISO88591Tests: ByteUtilTest<'toISO88591'>[] = [
}
}
];
const fromUTF8Tests: ByteUtilTest<'fromUTF8'>[] = [
const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
{
name: 'should create buffer from utf8 input',
inputs: [Buffer.from('abc\u{1f913}', 'utf8').toString('utf8')],
name: 'should insert utf8 bytes into buffer',
inputs: [Buffer.alloc(7), 'abc\u{1f913}', 0],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8'));
expect(output).to.equal(7);
expect(this.inputs[0]).to.deep.equal(Buffer.from('abc\u{1f913}', 'utf8'));
}
},
{
name: 'should return empty buffer for empty string input',
inputs: [''],
name: 'should return 0 and not modify input buffer',
inputs: [Uint8Array.from([2, 2]), '', 0],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.have.property('byteLength', 0);
expect(output).to.equal(0);
expect(this.inputs[0]).to.deep.equal(Uint8Array.from([2, 2]));
}
},
{
name: 'should return bytes with replacement character if string is not encodable',
inputs: ['\u{1f913}'.slice(0, 1)],
name: 'should insert replacement character bytes if string is not encodable',
inputs: [Uint8Array.from({ length: 10 }, () => 2), '\u{1f913}'.slice(0, 1), 2],
expectation({ output, error }) {
expect(error).to.be.null;
expect(output).to.have.property('byteLength', 3);
expect(output).to.have.property('0', 0xef);
expect(output).to.have.property('1', 0xbf);
expect(output).to.have.property('2', 0xbd);
const backToString = Buffer.from(output!).toString('utf8');
expect(output).to.equal(3);
expect(this.inputs[0]).to.have.property('2', 0xef);
expect(this.inputs[0]).to.have.property('3', 0xbf);
expect(this.inputs[0]).to.have.property('4', 0xbd);
const backToString = Buffer.from(this.inputs[0].subarray(2, 5)).toString('utf8');
const replacementCharacter = '\u{fffd}';
expect(backToString).to.equal(replacementCharacter);
}
Expand Down Expand Up @@ -507,7 +509,7 @@ const table = new Map<keyof ByteUtils, ByteUtilTest<keyof ByteUtils>[]>([
['toHex', toHexTests],
['fromISO88591', fromISO88591Tests],
['toISO88591', toISO88591Tests],
['fromUTF8', fromUTF8Tests],
['encodeUTF8Into', fromUTF8Tests],
['toUTF8', toUTF8Tests],
['utf8ByteLength', utf8ByteLengthTests],
['randomBytes', randomBytesTests]
Expand Down
112 changes: 86 additions & 26 deletions test/node/utils/latin.test.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import { expect } from 'chai';
import { tryLatin } from '../../../src/utils/latin';
import { tryReadBasicLatin, tryWriteBasicLatin } from '../../../src/utils/latin';
import * as sinon from 'sinon';

describe('tryLatin()', () => {
describe('tryReadBasicLatin()', () => {
context('when given a buffer of length 0', () => {
it('returns an empty string', () => {
expect(tryLatin(new Uint8Array(), 0, 10)).to.equal('');
expect(tryReadBasicLatin(new Uint8Array(), 0, 10)).to.equal('');
});
});

context('when the distance between end and start is 0', () => {
it('returns an empty string', () => {
expect(tryLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal('');
expect(tryReadBasicLatin(new Uint8Array([1, 2, 3]), 0, 0)).to.equal('');
});
});

Expand All @@ -30,61 +30,61 @@ describe('tryLatin()', () => {
context('when there is 1 byte', () => {
context('that exceed 127', () => {
it('returns null', () => {
expect(tryLatin(new Uint8Array([128]), 0, 1)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128]), 0, 1)).be.null;
});
});

it('calls fromCharCode once', () => {
tryLatin(new Uint8Array([95]), 0, 1);
tryReadBasicLatin(new Uint8Array([95]), 0, 1);
expect(fromCharCodeSpy).to.have.been.calledOnce;
});

it('never calls array.push', () => {
tryLatin(new Uint8Array([95]), 0, 1);
tryReadBasicLatin(new Uint8Array([95]), 0, 1);
expect(pushSpy).to.have.not.been.called;
});
});

context('when there is 2 bytes', () => {
context('that exceed 127', () => {
it('returns null', () => {
expect(tryLatin(new Uint8Array([0, 128]), 0, 2)).be.null;
expect(tryLatin(new Uint8Array([128, 0]), 0, 2)).be.null;
expect(tryLatin(new Uint8Array([128, 128]), 0, 2)).be.null;
expect(tryReadBasicLatin(new Uint8Array([0, 128]), 0, 2)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 0]), 0, 2)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 128]), 0, 2)).be.null;
});
});

it('calls fromCharCode twice', () => {
tryLatin(new Uint8Array([95, 105]), 0, 2);
tryReadBasicLatin(new Uint8Array([95, 105]), 0, 2);
expect(fromCharCodeSpy).to.have.been.calledTwice;
});

it('never calls array.push', () => {
tryLatin(new Uint8Array([95, 105]), 0, 2);
tryReadBasicLatin(new Uint8Array([95, 105]), 0, 2);
expect(pushSpy).to.have.not.been.called;
});
});

context('when there is 3 bytes', () => {
context('that exceed 127', () => {
it('returns null', () => {
expect(tryLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null;
expect(tryLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([0, 0, 128]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([0, 128, 0]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 0, 0]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 128, 128]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 128, 0]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([128, 0, 128]), 0, 3)).be.null;
expect(tryReadBasicLatin(new Uint8Array([0, 128, 128]), 0, 3)).be.null;
});
});

it('calls fromCharCode thrice', () => {
tryLatin(new Uint8Array([95, 105, 100]), 0, 3);
tryReadBasicLatin(new Uint8Array([95, 105, 100]), 0, 3);
expect(fromCharCodeSpy).to.have.been.calledThrice;
});

it('never calls array.push', () => {
tryLatin(new Uint8Array([95, 105, 100]), 0, 3);
tryReadBasicLatin(new Uint8Array([95, 105, 100]), 0, 3);
expect(pushSpy).to.have.not.been.called;
});
});
Expand All @@ -93,26 +93,86 @@ describe('tryLatin()', () => {
context(`when there is ${stringLength} bytes`, () => {
context('that exceed 127', () => {
it('returns null', () => {
expect(tryLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be.null;
expect(tryReadBasicLatin(new Uint8Array(stringLength).fill(128), 0, stringLength)).be
.null;
});
});

it('calls fromCharCode once', () => {
tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
tryReadBasicLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
expect(fromCharCodeSpy).to.have.been.calledOnce;
});

it(`calls array.push ${stringLength}`, () => {
tryLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
tryReadBasicLatin(new Uint8Array(stringLength).fill(95), 0, stringLength);
expect(pushSpy).to.have.callCount(stringLength);
});
});
}

context('when there is >21 bytes', () => {
it('returns null', () => {
expect(tryLatin(new Uint8Array(21).fill(95), 0, 21)).be.null;
expect(tryLatin(new Uint8Array(201).fill(95), 0, 201)).be.null;
expect(tryReadBasicLatin(new Uint8Array(21).fill(95), 0, 21)).be.null;
expect(tryReadBasicLatin(new Uint8Array(201).fill(95), 0, 201)).be.null;
});
});
});

// Unit tests for tryWriteBasicLatin(): empty input, insufficient destination space,
// rejection of code units above 127, the number of code units inspected, and the 25 code unit limit.
describe('tryWriteBasicLatin()', () => {
context('when given a string of length 0', () => {
it('returns 0 and does not modify the destination', () => {
// The destination is pre-filled with 1s so any write would be detectable.
const input = Uint8Array.from({ length: 10 }, () => 1);
expect(tryWriteBasicLatin(input, '', 2)).to.equal(0);
expect(input).to.deep.equal(Uint8Array.from({ length: 10 }, () => 1));
});
});

context('when given a string with a length larger than the buffer', () => {
it('returns null', () => {
const input = Uint8Array.from({ length: 10 }, () => 1);
// 11 characters cannot fit in 10 bytes starting at offset 0.
expect(tryWriteBasicLatin(input, 'a'.repeat(11), 0)).to.be.null;
// 13 characters cannot fit in the 8 bytes remaining after offset 2.
expect(tryWriteBasicLatin(input, 'a'.repeat(13), 2)).to.be.null;
});
});

// Spy used by the tests below to count how many code units were read from the string.
let charCodeAtSpy;

beforeEach(() => {
charCodeAtSpy = sinon.spy(String.prototype, 'charCodeAt');
});

afterEach(() => {
// Remove the spy so String.prototype.charCodeAt is back to normal for other tests.
sinon.restore();
});

// Exercise every string length from 1 up to 25.
for (let stringLength = 1; stringLength <= 25; stringLength++) {
context(`when there is ${stringLength} bytes`, () => {
context('that exceed 127', () => {
it('returns null', () => {
// '\x80' is code unit 128; it is placed last, after stringLength - 1 'a' characters.
expect(
tryWriteBasicLatin(
new Uint8Array(stringLength * 3),
'a'.repeat(stringLength - 1) + '\x80',
0
)
).be.null;
});
});

it(`calls charCodeAt ${stringLength}`, () => {
// Every character is code unit 127, and the destination (3x the string length)
// has room for the string when writing starts at offset stringLength.
tryWriteBasicLatin(
new Uint8Array(stringLength * 3),
String.fromCharCode(127).repeat(stringLength),
stringLength
);
expect(charCodeAtSpy).to.have.callCount(stringLength);
});
});
}

context('when there is >25 characters', () => {
it('returns null', () => {
// 26 characters with a 75 byte destination: there is enough space, but the string is over 25 characters.
expect(tryWriteBasicLatin(new Uint8Array(75), 'a'.repeat(26), 0)).be.null;
});
});
});