From ff421dfd54076c9d39cb7db5304f4e71b27cde0f Mon Sep 17 00:00:00 2001 From: "Mark S. Miller" Date: Thu, 25 Jan 2024 13:04:46 -0800 Subject: [PATCH] fix(marshal)!: compare strings by codepoint --- packages/marshal/NEWS.md | 5 + packages/marshal/index.js | 1 + packages/marshal/src/rankOrder.js | 45 ++++++++- .../test/encodePassable-for-testing.js | 87 +++++++++++++++++ packages/marshal/test/encodePassable.test.js | 95 +++---------------- .../marshal/test/test-string-rank-order.js | 64 +++++++++++++ packages/patterns/NEWS.md | 2 + .../patterns/test/test-string-key-order.js | 54 +++++++++++ 8 files changed, 266 insertions(+), 87 deletions(-) create mode 100644 packages/marshal/test/encodePassable-for-testing.js create mode 100644 packages/marshal/test/test-string-rank-order.js create mode 100644 packages/patterns/test/test-string-key-order.js diff --git a/packages/marshal/NEWS.md b/packages/marshal/NEWS.md index 898d890a1c..21e051cca8 100644 --- a/packages/marshal/NEWS.md +++ b/packages/marshal/NEWS.md @@ -3,6 +3,11 @@ User-visible changes in `@endo/marshal`: # Next release - `deeplyFulfilled` moved from @endo/marshal to @endo/pass-style. @endo/marshal still reexports it, to avoid breaking old importers. But importers should be upgraded to import `deeplyFulfilled` directly from @endo/pass-style. +- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareRank` and associated functions compared strings using this JavaScript-native comparison. Now `compareRank` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***. + - This release does not change the `encodePassable` encoding. But now, when we say it is order preserving, we need to be careful about which order we mean. `encodePassable` is rank-order preserving when the encoded strings are compared using `compareRank`. + - The key order of strings defined by the @endo/patterns module is still defined to be the same as the rank ordering of those strings. So this release changes key order among strings to also be lexicographic comparison of Unicode Code Points. To accommodate this change, you may need to adapt applications that relied on key-order being the same as JS native order. This could include the use of any patterns expressing key inequality tests, like `M.gte(string)`. + - These string ordering changes brings Endo into conformance with any string ordering components of the OCapN standard. + - To accommodate these change, you may need to adapt applications that relied on rank-order or key-order being the same as JS native order. You may need to resort any data that had previously been rank sorted using the prior `compareRank` function. You may need to revisit any use of patterns like `M.gte(string)` expressing inequalities over strings. # v1.3.0 (2024-02-22) diff --git a/packages/marshal/index.js b/packages/marshal/index.js index 1a2674de49..1cab921518 100644 --- a/packages/marshal/index.js +++ b/packages/marshal/index.js @@ -16,6 +16,7 @@ export { export { trivialComparator, + compareByCodePoints, assertRankSorted, compareRank, isRankSorted, diff --git a/packages/marshal/src/rankOrder.js b/packages/marshal/src/rankOrder.js index 2018833417..644e20015a 100644 --- a/packages/marshal/src/rankOrder.js +++ b/packages/marshal/src/rankOrder.js @@ -8,7 +8,7 @@ import { /** * @import {Passable, PassStyle} from '@endo/pass-style' - * @import {FullCompare, RankCompare, RankCover} from './types.js' + * @import {FullCompare, RankCompare, RankCover, RankComparison} from './types.js' */ const { entries, fromEntries, setPrototypeOf, is } = Object; @@ -44,9 +44,46 @@ const { entries, fromEntries, setPrototypeOf, is } = Object; */ const sameValueZero = (x, y) => x === y || is(x, y); +/** + * @param {any} left + * @param {any} right + * @returns {RankComparison} + */ export const trivialComparator = (left, right) => // eslint-disable-next-line no-nested-ternary, @endo/restrict-comparison-operands left < right ? -1 : left === right ? 0 : 1; +harden(trivialComparator); + +// Apparently eslint confused about whether the function can ever exit +// without an explicit return. +// eslint-disable-next-line jsdoc/require-returns-check +/** + * @param {string} left + * @param {string} right + * @returns {RankComparison} + */ +export const compareByCodePoints = (left, right) => { + const leftIter = left[Symbol.iterator](); + const rightIter = right[Symbol.iterator](); + for (;;) { + const { value: leftChar } = leftIter.next(); + const { value: rightChar } = rightIter.next(); + if (leftChar === undefined && rightChar === undefined) { + return 0; + } else if (leftChar === undefined) { + // left is a prefix of right. + return -1; + } else if (rightChar === undefined) { + // right is a prefix of left. + return 1; + } + const leftCodepoint = /** @type {number} */ (leftChar.codePointAt(0)); + const rightCodepoint = /** @type {number} */ (rightChar.codePointAt(0)); + if (leftCodepoint < rightCodepoint) return -1; + if (leftCodepoint > rightCodepoint) return 1; + } +}; +harden(compareByCodePoints); /** * @typedef {Record} PassStyleRanksRecord @@ -138,8 +175,7 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => { return 0; } case 'boolean': - case 'bigint': - case 'string': { + case 'bigint': { // Within each of these passStyles, the rank ordering agrees with // JavaScript's relational operators `<` and `>`. if (left < right) { @@ -149,6 +185,9 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => { return 1; } } + case 'string': { + return compareByCodePoints(left, right); + } case 'symbol': { return comparator( nameForPassableSymbol(left), diff --git a/packages/marshal/test/encodePassable-for-testing.js b/packages/marshal/test/encodePassable-for-testing.js new file mode 100644 index 0000000000..032383bf95 --- /dev/null +++ b/packages/marshal/test/encodePassable-for-testing.js @@ -0,0 +1,87 @@ +/* eslint-disable no-bitwise, @endo/restrict-comparison-operands */ +import { Fail, q } from '@endo/errors'; + +import { + makeEncodePassable, + makeDecodePassable, +} from '../src/encodePassable.js'; +import { compareRank, makeComparatorKit } from '../src/rankOrder.js'; + +const buffers = { + __proto__: null, + r: [], + '?': [], + '!': [], +}; +const resetBuffers = () => { + buffers.r = []; + buffers['?'] = []; + buffers['!'] = []; +}; +const cursors = { + __proto__: null, + r: 0, + '?': 0, + '!': 0, +}; +const resetCursors = () => { + cursors.r = 0; + cursors['?'] = 0; + cursors['!'] = 0; +}; + +const encodeThing = (prefix, r) => { + buffers[prefix].push(r); + // With this encoding, all things with the same prefix have the same rank + return prefix; +}; + +const decodeThing = (prefix, e) => { + prefix === e || + Fail`expected encoding ${q(e)} to simply be the prefix ${q(prefix)}`; + (cursors[prefix] >= 0 && cursors[prefix] < buffers[prefix].length) || + Fail`while decoding ${q(e)}, expected cursors[${q(prefix)}], i.e., ${q( + cursors[prefix], + )} <= ${q(buffers[prefix].length)}`; + const thing = buffers[prefix][cursors[prefix]]; + cursors[prefix] += 1; + return thing; +}; + +const encodePassableInternal = makeEncodePassable({ + encodeRemotable: r => encodeThing('r', r), + encodePromise: p => encodeThing('?', p), + encodeError: er => encodeThing('!', er), +}); + +export const encodePassableInternal2 = makeEncodePassable({ + encodeRemotable: r => encodeThing('r', r), + encodePromise: p => encodeThing('?', p), + encodeError: er => encodeThing('!', er), + format: 'compactOrdered', +}); + +export const encodePassable = passable => { + resetBuffers(); + return encodePassableInternal(passable); +}; + +export const encodePassable2 = passable => { + resetBuffers(); + return encodePassableInternal2(passable); +}; +export const decodePassableInternal = makeDecodePassable({ + decodeRemotable: e => decodeThing('r', e), + decodePromise: e => decodeThing('?', e), + decodeError: e => decodeThing('!', e), +}); + +export const decodePassable = encoded => { + resetCursors(); + return decodePassableInternal(encoded); +}; + +const compareRemotables = (x, y) => + compareRank(encodeThing('r', x), encodeThing('r', y)); + +export const { comparator: compareFull } = makeComparatorKit(compareRemotables); diff --git a/packages/marshal/test/encodePassable.test.js b/packages/marshal/test/encodePassable.test.js index 6583479efe..d539b7df08 100644 --- a/packages/marshal/test/encodePassable.test.js +++ b/packages/marshal/test/encodePassable.test.js @@ -5,91 +5,20 @@ import test from '@endo/ses-ava/prepare-endo.js'; import { fc } from '@fast-check/ava'; import { Remotable } from '@endo/pass-style'; import { arbPassable } from '@endo/pass-style/tools.js'; -import { Fail, q } from '@endo/errors'; +import { Fail } from '@endo/errors'; -import { - makePassableKit, - makeEncodePassable, - makeDecodePassable, -} from '../src/encodePassable.js'; -import { compareRank, makeComparatorKit } from '../src/rankOrder.js'; +import { makePassableKit, makeEncodePassable } from '../src/encodePassable.js'; +import { compareRank } from '../src/rankOrder.js'; import { unsortedSample } from './marshal-test-data.js'; -const buffers = { - __proto__: null, - r: [], - '?': [], - '!': [], -}; -const resetBuffers = () => { - buffers.r = []; - buffers['?'] = []; - buffers['!'] = []; -}; -const cursors = { - __proto__: null, - r: 0, - '?': 0, - '!': 0, -}; -const resetCursors = () => { - cursors.r = 0; - cursors['?'] = 0; - cursors['!'] = 0; -}; - -const encodeThing = (prefix, r) => { - buffers[prefix].push(r); - // With this encoding, all things with the same prefix have the same rank - return prefix; -}; - -const decodeThing = (prefix, e) => { - prefix === e || - Fail`expected encoding ${q(e)} to simply be the prefix ${q(prefix)}`; - (cursors[prefix] >= 0 && cursors[prefix] < buffers[prefix].length) || - Fail`while decoding ${q(e)}, expected cursors[${q(prefix)}], i.e., ${q( - cursors[prefix], - )} <= ${q(buffers[prefix].length)}`; - const thing = buffers[prefix][cursors[prefix]]; - cursors[prefix] += 1; - return thing; -}; - -const compareRemotables = (x, y) => - compareRank(encodeThing('r', x), encodeThing('r', y)); - -const encodePassableInternal = makeEncodePassable({ - encodeRemotable: r => encodeThing('r', r), - encodePromise: p => encodeThing('?', p), - encodeError: er => encodeThing('!', er), -}); -const encodePassableInternal2 = makeEncodePassable({ - encodeRemotable: r => encodeThing('r', r), - encodePromise: p => encodeThing('?', p), - encodeError: er => encodeThing('!', er), - format: 'compactOrdered', -}); - -const encodePassable = passable => { - resetBuffers(); - return encodePassableInternal(passable); -}; -const encodePassable2 = passable => { - resetBuffers(); - return encodePassableInternal2(passable); -}; - -const decodePassableInternal = makeDecodePassable({ - decodeRemotable: e => decodeThing('r', e), - decodePromise: e => decodeThing('?', e), - decodeError: e => decodeThing('!', e), -}); - -const decodePassable = encoded => { - resetCursors(); - return decodePassableInternal(encoded); -}; +import { + encodePassable, + encodePassable2, + encodePassableInternal2, + decodePassable, + decodePassableInternal, + compareFull, +} from './encodePassable-for-testing.js'; test('makePassableKit output shape', t => { const kit = makePassableKit(); @@ -133,8 +62,6 @@ test( (...args) => makePassableKit(...args).encodePassable, ); -const { comparator: compareFull } = makeComparatorKit(compareRemotables); - const asNumber = new Float64Array(1); const asBits = new BigUint64Array(asNumber.buffer); const getNaN = (hexEncoding = '0008000000000000') => { diff --git a/packages/marshal/test/test-string-rank-order.js b/packages/marshal/test/test-string-rank-order.js new file mode 100644 index 0000000000..b8c5333d44 --- /dev/null +++ b/packages/marshal/test/test-string-rank-order.js @@ -0,0 +1,64 @@ +import test from '@endo/ses-ava/prepare-endo.js'; + +import { compareRank } from '../src/rankOrder.js'; +import { encodePassable } from './encodePassable-for-testing.js'; + +/** + * Essentially a ponyfill for Array.prototype.toSorted, for use before + * we can always rely on the platform to provide it. + * + * @param {string[]} strings + * @param {( + * left: string, + * right: string + * ) => import('../src/types.js').RankComparison} comp + * @returns {string[]} + */ +const sorted = (strings, comp) => [...strings].sort(comp); + +test('unicode code point order', t => { + // Test case from + // https://icu-project.org/docs/papers/utf16_code_point_order.html + const str0 = '\u{ff61}'; + const str3 = '\u{d800}\u{dc02}'; + + // str1 and str2 become impossible examples once we prohibit + // non - well - formed strings. + // See https://github.com/endojs/endo/pull/2002 + const str1 = '\u{d800}X'; + const str2 = '\u{d800}\u{ff61}'; + + // harden to ensure it is not sorted in place, just for sanity + const strs = harden([str0, str1, str2, str3]); + + /** + * @param {string} left + * @param {string} right + * @returns {import('../src/types.js').RankComparison} + */ + const nativeComp = (left, right) => + // eslint-disable-next-line no-nested-ternary + left < right ? -1 : left > right ? 1 : 0; + + const nativeSorted = sorted(strs, nativeComp); + + t.deepEqual(nativeSorted, [str1, str3, str2, str0]); + + const rankSorted = sorted(strs, compareRank); + + t.deepEqual(rankSorted, [str1, str2, str0, str3]); + + const nativeEncComp = (left, right) => + nativeComp(encodePassable(left), encodePassable(right)); + + const nativeEncSorted = sorted(strs, nativeEncComp); + + t.deepEqual(nativeEncSorted, nativeSorted); + + const rankEncComp = (left, right) => + compareRank(encodePassable(left), encodePassable(right)); + + const rankEncSorted = sorted(strs, rankEncComp); + + t.deepEqual(rankEncSorted, rankSorted); +}); diff --git a/packages/patterns/NEWS.md b/packages/patterns/NEWS.md index b644bb765a..9b4557ffab 100644 --- a/packages/patterns/NEWS.md +++ b/packages/patterns/NEWS.md @@ -4,6 +4,8 @@ User-visible changes in `@endo/patterns`: - `Passable` is now an accurate type instead of `any`. Downstream type checking may require changes ([example](https://github.com/Agoric/agoric-sdk/pull/8774)) - Some downstream types that take or return `Passable` were changed to `any` to defer downstream work to accomodate. +- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareKeys` and associated functions compared strings using this JavaScript-native comparison. Now `compareKeys` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***. + - See the NEWS.md of @endo/marshal for more on this change. # v1.2.0 (2024-02-22) diff --git a/packages/patterns/test/test-string-key-order.js b/packages/patterns/test/test-string-key-order.js new file mode 100644 index 0000000000..28c04c0c6c --- /dev/null +++ b/packages/patterns/test/test-string-key-order.js @@ -0,0 +1,54 @@ +// modeled on test-string-rank-order.js +import test from '@endo/ses-ava/prepare-endo.js'; + +import { compareKeys } from '../src/keys/compareKeys.js'; + +/** + * Essentially a ponyfill for Array.prototype.toSorted, for use before + * we can always rely on the platform to provide it. + * + * @param {string[]} strings + * @param {( + * left: string, + * right: string + * ) => import('@endo/marshal').RankComparison} comp + * @returns {string[]} + */ +const sorted = (strings, comp) => [...strings].sort(comp); + +test('unicode code point order', t => { + // Test case from + // https://icu-project.org/docs/papers/utf16_code_point_order.html + const str0 = '\u{ff61}'; + const str3 = '\u{d800}\u{dc02}'; + + // str1 and str2 become impossible examples once we prohibit + // non - well - formed strings. + // See https://github.com/endojs/endo/pull/2002 + const str1 = '\u{d800}X'; + const str2 = '\u{d800}\u{ff61}'; + + // harden to ensure it is not sorted in place, just for sanity + const strs = harden([str0, str1, str2, str3]); + + /** + * @param {string} left + * @param {string} right + * @returns {import('@endo/marshal').RankComparison} + */ + const nativeComp = (left, right) => + // eslint-disable-next-line no-nested-ternary + left < right ? -1 : left > right ? 1 : 0; + + const nativeSorted = sorted(strs, nativeComp); + + t.deepEqual(nativeSorted, [str1, str3, str2, str0]); + + // @ts-expect-error We know that for strings, `compareKeys` never returns + // NaN because it never judges strings to be incomparable. Thus, the + // KeyComparison it returns happens to also be a RankComparison we can + // sort with. + const keySorted = sorted(strs, compareKeys); + + t.deepEqual(keySorted, [str1, str2, str0, str3]); +});