diff --git a/.changeset/large-pants-hide.md b/.changeset/large-pants-hide.md new file mode 100644 index 00000000000..fbaf7bd6008 --- /dev/null +++ b/.changeset/large-pants-hide.md @@ -0,0 +1,6 @@ +--- +'@firebase/firestore': patch +'firebase': patch +--- + +Use lazy encoding in UTF-8 encoded byte comparison for strings. diff --git a/packages/firestore/src/local/indexeddb_remote_document_cache.ts b/packages/firestore/src/local/indexeddb_remote_document_cache.ts index b3d4658d53d..9b23c64fcf5 100644 --- a/packages/firestore/src/local/indexeddb_remote_document_cache.ts +++ b/packages/firestore/src/local/indexeddb_remote_document_cache.ts @@ -655,5 +655,9 @@ export function dbKeyComparator(l: DocumentKey, r: DocumentKey): number { return cmp; } + // TODO(b/329441702): Document IDs should be sorted by UTF-8 encoded byte + // order, but IndexedDB sorts strings lexicographically. Document ID + // comparison here still relies on primitive comparison to avoid mismatches + // observed in snapshot listeners with Unicode characters in documentIds return primitiveComparator(left[left.length - 1], right[right.length - 1]); } diff --git a/packages/firestore/src/model/path.ts b/packages/firestore/src/model/path.ts index 64cb0376a0e..c375b4c56d2 100644 --- a/packages/firestore/src/model/path.ts +++ b/packages/firestore/src/model/path.ts @@ -19,6 +19,7 @@ import { Integer } from '@firebase/webchannel-wrapper/bloom-blob'; import { debugAssert, fail } from '../util/assert'; import { Code, FirestoreError } from '../util/error'; +import { compareUtf8Strings, primitiveComparator } from '../util/misc'; export const DOCUMENT_KEY_NAME = '__name__'; @@ -181,7 +182,7 @@ abstract class BasePath> { return comparison; } } - return Math.sign(p1.length - p2.length); + return primitiveComparator(p1.length, p2.length); } private static compareSegments(lhs: string, rhs: string): number { @@ -201,13 +202,7 @@ abstract class BasePath> { ); } else { // both non-numeric - if (lhs < rhs) { - return -1; - } - if (lhs > rhs) { - return 1; - } - return 0; + return compareUtf8Strings(lhs, rhs); } } diff --git a/packages/firestore/src/model/values.ts b/packages/firestore/src/model/values.ts index 1977767515e..30d8688b776 100644 --- a/packages/firestore/src/model/values.ts +++ b/packages/firestore/src/model/values.ts @@ -25,7 +25,11 @@ import { Value } from '../protos/firestore_proto_api'; import { fail } from '../util/assert'; -import { arrayEquals, primitiveComparator } from '../util/misc'; +import { + arrayEquals, + compareUtf8Strings, + primitiveComparator +} from '../util/misc'; import { forEach, objectSize } from '../util/obj'; import { isNegativeZero } from '../util/types'; @@ -251,7 +255,7 @@ export function valueCompare(left: Value, right: Value): number { getLocalWriteTime(right) ); case TypeOrder.StringValue: - return primitiveComparator(left.stringValue!, right.stringValue!); + return compareUtf8Strings(left.stringValue!, right.stringValue!); case TypeOrder.BlobValue: return compareBlobs(left.bytesValue!, right.bytesValue!); case TypeOrder.RefValue: @@ -400,7 +404,7 @@ function compareMaps(left: MapValue, right: MapValue): number { rightKeys.sort(); for (let i = 0; i < leftKeys.length && i < rightKeys.length; ++i) { - const keyCompare = primitiveComparator(leftKeys[i], rightKeys[i]); + const keyCompare = compareUtf8Strings(leftKeys[i], rightKeys[i]); if (keyCompare !== 0) { return keyCompare; } diff --git a/packages/firestore/src/util/misc.ts b/packages/firestore/src/util/misc.ts index acaff77abb6..04c4891165e 100644 --- a/packages/firestore/src/util/misc.ts +++ b/packages/firestore/src/util/misc.ts @@ -16,6 +16,7 @@ */ import { randomBytes } from '../platform/random_bytes'; +import { newTextEncoder } from '../platform/text_serializer'; import { debugAssert } from './assert'; @@ -74,6 +75,52 @@ export interface Equatable { isEqual(other: T): boolean; } +/** Compare strings in UTF-8 encoded byte order */ +export function compareUtf8Strings(left: string, right: string): number { + for (let i = 0; i < left.length && i < right.length; i++) { + const leftCodePoint = left.codePointAt(i)!; + const rightCodePoint = right.codePointAt(i)!; + + if (leftCodePoint !== rightCodePoint) { + if (leftCodePoint < 128 && rightCodePoint < 128) { + // ASCII comparison + return primitiveComparator(leftCodePoint, rightCodePoint); + } else { + // Lazy instantiate TextEncoder + const encoder = newTextEncoder(); + + // Substring and do UTF-8 encoded byte comparison + const leftBytes = encoder.encode(getUtf8SafeSubstring(left, i)); + const rightBytes = encoder.encode(getUtf8SafeSubstring(right, i)); + for ( + let j = 0; + j < Math.min(leftBytes.length, rightBytes.length); + j++ + ) { + const comp = primitiveComparator(leftBytes[j], rightBytes[j]); + if (comp !== 0) { + return comp; + } + } + } + } + } + + // Compare lengths if all characters are equal + return primitiveComparator(left.length, right.length); +} + +function getUtf8SafeSubstring(str: string, index: number): string { + const firstCodePoint = str.codePointAt(index)!; + if (firstCodePoint > 0xffff) { + // It's a surrogate pair, return the whole pair + return str.substring(index, index + 2); + } else { + // It's a single code point, return it + return str.substring(index, index + 1); + } +} + export interface Iterable { forEach: (cb: (v: V) => void) => void; } diff --git a/packages/firestore/test/integration/api/database.test.ts b/packages/firestore/test/integration/api/database.test.ts index 1cda49d9229..8cbe99b3cd9 100644 --- a/packages/firestore/test/integration/api/database.test.ts +++ b/packages/firestore/test/integration/api/database.test.ts @@ -2424,4 +2424,243 @@ apiDescribe('Database', persistence => { }); }); }); + + describe('Sort unicode strings', () => { + const expectedDocs = [ + 'b', + 'a', + 'h', + 'i', + 'c', + 'f', + 'e', + 'd', + 'g', + 'k', + 'j' + ]; + it('snapshot listener sorts unicode strings the same as server', async () => { + const testDocs = { + 'a': { value: 'Łukasiewicz' }, + 'b': { value: 'Sierpiński' }, + 'c': { value: '岩澤' }, + 'd': { value: '🄟' }, + 'e': { value: 'P' }, + 'f': { value: '︒' }, + 'g': { value: '🐵' }, + 'h': { value: '你好' }, + 'i': { value: '你顥' }, + 'j': { value: '😁' }, + 'k': { value: '😀' } + }; + + return withTestCollection(persistence, testDocs, async collectionRef => { + const orderedQuery = query(collectionRef, orderBy('value')); + + const getSnapshot = await getDocsFromServer(orderedQuery); + expect(toIds(getSnapshot)).to.deep.equal(expectedDocs); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + + await checkOnlineAndOfflineResultsMatch(orderedQuery, ...expectedDocs); + }); + }); + + it('snapshot listener sorts unicode strings in array the same as server', async () => { + const testDocs = { + 'a': { value: ['Łukasiewicz'] }, + 'b': { value: ['Sierpiński'] }, + 'c': { value: ['岩澤'] }, + 'd': { value: ['🄟'] }, + 'e': { value: ['P'] }, + 'f': { value: ['︒'] }, + 'g': { value: ['🐵'] }, + 'h': { value: ['你好'] }, + 'i': { value: ['你顥'] }, + 'j': { value: ['😁'] }, + 'k': { value: ['😀'] } + }; + + return withTestCollection(persistence, testDocs, async collectionRef => { + const orderedQuery = query(collectionRef, orderBy('value')); + + const getSnapshot = await getDocsFromServer(orderedQuery); + expect(toIds(getSnapshot)).to.deep.equal(expectedDocs); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + + await checkOnlineAndOfflineResultsMatch(orderedQuery, ...expectedDocs); + }); + }); + + it('snapshot listener sorts unicode strings in map the same as server', async () => { + const testDocs = { + 'a': { value: { foo: 'Łukasiewicz' } }, + 'b': { value: { foo: 'Sierpiński' } }, + 'c': { value: { foo: '岩澤' } }, + 'd': { value: { foo: '🄟' } }, + 'e': { value: { foo: 'P' } }, + 'f': { value: { foo: '︒' } }, + 'g': { value: { foo: '🐵' } }, + 'h': { value: { foo: '你好' } }, + 'i': { value: { foo: '你顥' } }, + 'j': { value: { foo: '😁' } }, + 'k': { value: { foo: '😀' } } + }; + + return withTestCollection(persistence, testDocs, async collectionRef => { + const orderedQuery = query(collectionRef, orderBy('value')); + + const getSnapshot = await getDocsFromServer(orderedQuery); + expect(toIds(getSnapshot)).to.deep.equal(expectedDocs); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + + await checkOnlineAndOfflineResultsMatch(orderedQuery, ...expectedDocs); + }); + }); + + it('snapshot listener sorts unicode strings in map key the same as server', async () => { + const testDocs = { + 'a': { value: { 'Łukasiewicz': true } }, + 'b': { value: { 'Sierpiński': true } }, + 'c': { value: { '岩澤': true } }, + 'd': { value: { '🄟': true } }, + 'e': { value: { 'P': true } }, + 'f': { value: { '︒': true } }, + 'g': { value: { '🐵': true } }, + 'h': { value: { '你好': true } }, + 'i': { value: { '你顥': true } }, + 'j': { value: { '😁': true } }, + 'k': { value: { '😀': true } } + }; + + return withTestCollection(persistence, testDocs, async collectionRef => { + const orderedQuery = query(collectionRef, orderBy('value')); + + const getSnapshot = await getDocsFromServer(orderedQuery); + expect(toIds(getSnapshot)).to.deep.equal(expectedDocs); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + + await checkOnlineAndOfflineResultsMatch(orderedQuery, ...expectedDocs); + }); + }); + + it('snapshot listener sorts unicode strings in document key the same as server', async () => { + const testDocs = { + 'Łukasiewicz': { value: true }, + 'Sierpiński': { value: true }, + '岩澤': { value: true }, + '🄟': { value: true }, + 'P': { value: true }, + '︒': { value: true }, + '🐵': { value: true }, + '你好': { value: true }, + '你顥': { value: true }, + '😁': { value: true }, + '😀': { value: true } + }; + + return withTestCollection(persistence, testDocs, async collectionRef => { + const orderedQuery = query(collectionRef, orderBy(documentId())); + + const getSnapshot = await getDocsFromServer(orderedQuery); + const expectedDocs = [ + 'Sierpiński', + 'Łukasiewicz', + '你好', + '你顥', + '岩澤', + '︒', + 'P', + '🄟', + '🐵', + '😀', + '😁' + ]; + expect(toIds(getSnapshot)).to.deep.equal(expectedDocs); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + + await checkOnlineAndOfflineResultsMatch(orderedQuery, ...expectedDocs); + }); + }); + + // eslint-disable-next-line no-restricted-properties + (persistence.storage === 'indexeddb' ? it.skip : it)( + 'snapshot listener sorts unicode strings in document key the same as server with persistence', + async () => { + const testDocs = { + 'Łukasiewicz': { value: true }, + 'Sierpiński': { value: true }, + '岩澤': { value: true }, + '🄟': { value: true }, + 'P': { value: true }, + '︒': { value: true }, + '🐵': { value: true }, + '你好': { value: true }, + '你顥': { value: true }, + '😁': { value: true }, + '😀': { value: true } + }; + + return withTestCollection( + persistence, + testDocs, + async collectionRef => { + const orderedQuery = query(collectionRef, orderBy('value')); + + const getSnapshot = await getDocsFromServer(orderedQuery); + expect(toIds(getSnapshot)).to.deep.equal([ + 'Sierpiński', + 'Łukasiewicz', + '你好', + '你顥', + '岩澤', + '︒', + 'P', + '🄟', + '🐵', + '😀', + '😁' + ]); + + const storeEvent = new EventsAccumulator(); + const unsubscribe = onSnapshot(orderedQuery, storeEvent.storeEvent); + const watchSnapshot = await storeEvent.awaitEvent(); + // TODO: IndexedDB sorts string lexicographically, and misses the document with ID '🄟','🐵' + expect(toIds(watchSnapshot)).to.deep.equal(toIds(getSnapshot)); + + unsubscribe(); + } + ); + } + ); + }); }); diff --git a/packages/firestore/test/unit/util/misc.test.ts b/packages/firestore/test/unit/util/misc.test.ts index cc75419c026..c1a2eeb82b7 100644 --- a/packages/firestore/test/unit/util/misc.test.ts +++ b/packages/firestore/test/unit/util/misc.test.ts @@ -18,7 +18,7 @@ import { expect } from 'chai'; import { debugCast } from '../../../src/util/assert'; -import { immediateSuccessor } from '../../../src/util/misc'; +import { compareUtf8Strings, immediateSuccessor } from '../../../src/util/misc'; import { mask } from '../../util/helpers'; describe('immediateSuccessor', () => { @@ -53,3 +53,251 @@ describe('FieldMask', () => { ); }); }); + +class StringPair { + constructor(readonly s1: string, readonly s2: string) {} +} + +class StringPairGenerator { + constructor(private stringGenerator: StringGenerator) {} + + next(): StringPair { + const prefix = this.stringGenerator.next(); + const s1 = prefix + this.stringGenerator.next(); + const s2 = prefix + this.stringGenerator.next(); + return new StringPair(s1, s2); + } +} + +class StringGenerator { + private static readonly DEFAULT_SURROGATE_PAIR_PROBABILITY = 0.33; + private static readonly DEFAULT_MAX_LENGTH = 20; + + // The first Unicode code point that is in the basic multilingual plane ("BMP") and, + // therefore requires 1 UTF-16 code unit to be represented in UTF-16. + private static readonly MIN_BMP_CODE_POINT = 0x00000000; + + // The last Unicode code point that is in the basic multilingual plane ("BMP") and, + // therefore requires 1 UTF-16 code unit to be represented in UTF-16. + private static readonly MAX_BMP_CODE_POINT = 0x0000ffff; + + // The first Unicode code point that is outside of the basic multilingual plane ("BMP") and, + // therefore requires 2 UTF-16 code units, a surrogate pair, to be represented in UTF-16. + private static readonly MIN_SUPPLEMENTARY_CODE_POINT = 0x00010000; + + // The last Unicode code point that is outside of the basic multilingual plane ("BMP") and, + // therefore requires 2 UTF-16 code units, a surrogate pair, to be represented in UTF-16. + private static readonly MAX_SUPPLEMENTARY_CODE_POINT = 0x0010ffff; + + private readonly rnd: Random; + private readonly surrogatePairProbability: number; + private readonly maxLength: number; + + constructor(seed: number); + constructor(rnd: Random, surrogatePairProbability: number, maxLength: number); + constructor( + seedOrRnd: number | Random, + surrogatePairProbability?: number, + maxLength?: number + ) { + if (typeof seedOrRnd === 'number') { + this.rnd = new Random(seedOrRnd); + this.surrogatePairProbability = + StringGenerator.DEFAULT_SURROGATE_PAIR_PROBABILITY; + this.maxLength = StringGenerator.DEFAULT_MAX_LENGTH; + } else { + this.rnd = seedOrRnd; + this.surrogatePairProbability = StringGenerator.validateProbability( + 'surrogate pair', + surrogatePairProbability! + ); + this.maxLength = StringGenerator.validateLength( + 'maximum string', + maxLength! + ); + } + } + + private static validateProbability( + name: string, + probability: number + ): number { + if (!Number.isFinite(probability)) { + throw new Error( + `invalid ${name} probability: ${probability} (must be between 0.0 and 1.0, inclusive)` + ); + } else if (probability < 0.0) { + throw new Error( + `invalid ${name} probability: ${probability} (must be greater than or equal to zero)` + ); + } else if (probability > 1.0) { + throw new Error( + `invalid ${name} probability: ${probability} (must be less than or equal to 1)` + ); + } + return probability; + } + + private static validateLength(name: string, length: number): number { + if (length < 0) { + throw new Error( + `invalid ${name} length: ${length} (must be greater than or equal to zero)` + ); + } + return length; + } + + next(): string { + const length = this.rnd.nextInt(this.maxLength + 1); + const sb = new StringBuilder(); + while (sb.length() < length) { + const codePoint = this.nextCodePoint(); + sb.appendCodePoint(codePoint); + } + return sb.toString(); + } + + private isNextSurrogatePair(): boolean { + return StringGenerator.nextBoolean(this.rnd, this.surrogatePairProbability); + } + + private static nextBoolean(rnd: Random, probability: number): boolean { + if (probability === 0.0) { + return false; + } else if (probability === 1.0) { + return true; + } else { + return rnd.nextFloat() < probability; + } + } + + private nextCodePoint(): number { + if (this.isNextSurrogatePair()) { + return this.nextSurrogateCodePoint(); + } else { + return this.nextNonSurrogateCodePoint(); + } + } + + private nextSurrogateCodePoint(): number { + const highSurrogateMin = 0xd800; + const highSurrogateMax = 0xdbff; + const lowSurrogateMin = 0xdc00; + const lowSurrogateMax = 0xdfff; + + const highSurrogate = this.nextCodePointRange( + highSurrogateMin, + highSurrogateMax + ); + const lowSurrogate = this.nextCodePointRange( + lowSurrogateMin, + lowSurrogateMax + ); + + return (highSurrogate - 0xd800) * 0x400 + (lowSurrogate - 0xdc00) + 0x10000; + } + + private nextNonSurrogateCodePoint(): number { + return this.nextCodePointRange( + StringGenerator.MIN_BMP_CODE_POINT, + StringGenerator.MAX_BMP_CODE_POINT + ); + } + + private nextCodePointRange(min: number, max: number): number { + const rangeSize = max - min + 1; + const offset = this.rnd.nextInt(rangeSize); + return min + offset; + } + + // private nextCodePointRange(min: number, max: number, expectedCharCount: number): number { + // const rangeSize = max - min; + // const offset = this.rnd.nextInt(rangeSize); + // const codePoint = min + offset; + // if (String.fromCharCode(codePoint).length !== expectedCharCount) { + // throw new Error( + // `internal error vqgqnxcy97: Character.charCount(${codePoint}) returned ${ + // String.fromCharCode(codePoint).length + // }, but expected ${expectedCharCount}`, + // ); + // } + // return codePoint; + // } +} + +class Random { + private seed: number; + + constructor(seed: number) { + this.seed = seed; + } + + nextInt(max: number): number { + this.seed = (this.seed * 9301 + 49297) % 233280; + const rnd = this.seed / 233280; + return Math.floor(rnd * max); + } + + nextFloat(): number { + this.seed = (this.seed * 9301 + 49297) % 233280; + return this.seed / 233280; + } +} + +class StringBuilder { + private buffer: string[] = []; + + append(str: string): StringBuilder { + this.buffer.push(str); + return this; + } + + appendCodePoint(codePoint: number): StringBuilder { + this.buffer.push(String.fromCodePoint(codePoint)); + return this; + } + + toString(): string { + return this.buffer.join(''); + } + + length(): number { + return this.buffer.join('').length; + } +} + +describe('CompareUtf8Strings', () => { + it('compareUtf8Strings should return correct results', () => { + const errors = []; + const seed = Math.floor(Math.random() * Number.MAX_SAFE_INTEGER); + let passCount = 0; + const stringGenerator = new StringGenerator(new Random(seed), 0.33, 20); + const stringPairGenerator = new StringPairGenerator(stringGenerator); + + for (let i = 0; i < 1000000 && errors.length < 10; i++) { + const { s1, s2 } = stringPairGenerator.next(); + + const actual = compareUtf8Strings(s1, s2); + const expected = Buffer.from(s1, 'utf8').compare(Buffer.from(s2, 'utf8')); + + if (actual === expected) { + passCount++; + } else { + errors.push( + `compareUtf8Strings(s1="${s1}", s2="${s2}") returned ${actual}, ` + + `but expected ${expected} (i=${i}, s1.length=${s1.length}, s2.length=${s2.length})` + ); + } + } + + if (errors.length > 0) { + console.error( + `${errors.length} test cases failed, ${passCount} test cases passed, seed=${seed};` + ); + errors.forEach((error, index) => + console.error(`errors[${index}]: ${error}`) + ); + throw new Error('Test failed'); + } + }).timeout(20000); +});