Skip to content

Commit

Permalink
Fix v-flag bugs (#85)
Browse files Browse the repository at this point in the history
  • Loading branch information
JLHwung authored Sep 23, 2023
1 parent 947f2cd commit 91ee342
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 27 deletions.
30 changes: 18 additions & 12 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ function flatMap(array, callback) {
return result;
}

function regenerateContainsAstral(regenerateData) {
const data = regenerateData.data;
return data.length >= 1 && data[data.length - 1] >= 0x10000;
}

const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;

// Prepare a Regenerate set containing all code points, used for negative
Expand Down Expand Up @@ -330,7 +335,7 @@ const buildHandler = (action) => {
}
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
}
Expand Down Expand Up @@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
}
Expand All @@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
case 'characterClassEscape':
handlePositive.regSet(data, getCharacterClassEscapeSet(
item.value,
config.flags.unicode,
config.flags.unicode || config.flags.unicodeSets,
config.flags.ignoreCase
));
break;
Expand All @@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand All @@ -488,13 +493,15 @@ const processCharacterClass = (
const negative = characterClassItem.negative;
const { singleChars, transformed, longStrings } = computed;
if (transformed) {
const setStr = singleChars.toString(regenerateOptions);
// If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
const bmpOnly = regenerateContainsAstral(singleChars);
const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));

if (negative) {
if (config.useUnicodeFlag) {
update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
} else {
if (config.flags.unicode) {
if (config.flags.unicode || config.flags.unicodeSets) {
if (config.flags.ignoreCase) {
const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
// Assumption: singleChars do not contain lone surrogates.
Expand All @@ -518,10 +525,9 @@ const processCharacterClass = (
);
} else {
// Generate negative set directly when case folding is not involved.
update(
characterClassItem,
UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
);
const negativeSet = UNICODE_SET.clone().remove(singleChars);
const bmpOnly = regenerateContainsAstral(negativeSet);
update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
}
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`);
Expand Down Expand Up @@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand Down Expand Up @@ -835,7 +841,7 @@ const rewritePattern = (pattern, flags, options) => {

const regenerateOptions = {
'hasUnicodeFlag': config.useUnicodeFlag,
'bmpOnly': !config.flags.unicode
'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
};

const groups = {
Expand Down
16 changes: 12 additions & 4 deletions tests/fixtures/character-class.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,33 @@ const characterClassFixtures = [
{
pattern: '[^K]', // LATIN CAPITAL LETTER K
flags: 'u',
expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["K"],
expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^k]', // LATIN SMALL LETTER K
flags: 'u',
expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["k"],
expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u212a]', // KELVIN SIGN
flags: 'u',
expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u212a"],
expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
flags: 'u',
expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u{1D50E}"],
expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
Expand Down
15 changes: 14 additions & 1 deletion tests/fixtures/unicode-set.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,21 @@ const unicodeSetFixtures = [
},
{
pattern: '[^[a-z][f-h]]',
expected: '(?:(?![a-z])[\\s\\S])',
matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
Expand Down Expand Up @@ -336,6 +342,13 @@ const unicodeSetFixtures = [
{
pattern: '[\\p{ASCII}&&\\p{Control}]',
expected: '[\\0-\\x1F\\x7F]',
},
{
pattern: '.',
flags: 'sv',
matches: ['\n'],
options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' },
expected: '[\\s\\S]'
}
];

Expand Down
13 changes: 9 additions & 4 deletions tests/fixtures/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ const unicodeFixtures = [
{
'pattern': '[\\s\\S]',
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\d',
Expand All @@ -68,8 +68,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\d\\D]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\w',
Expand Down Expand Up @@ -100,8 +101,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\w\\W]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
Expand Down Expand Up @@ -180,11 +182,14 @@ const unicodeFixtures = [
},
{
'pattern': '[^a]',
'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
'nonMatches': ['a'],
'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[^a]',
'nonMatches': ['a', 'A'],
'flags': FLAGS_WITH_UNICODE_WITH_I,
'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
Expand Down
81 changes: 75 additions & 6 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,30 @@ const { characterClassFixtures } = require("./fixtures/character-class.js");
const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
const { modifiersFixtures } = require("./fixtures/modifiers.js");

/** For node 6 compat */
assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) });
assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) });

/**
* comput output regex flags from input flags and transform options
*
* @param {string} inputFlags
* @param {*} regexpuOptions
*/
function getOutputFlags(inputFlags, options) {
let result = inputFlags;
if (options.unicodeSetsFlag === "transform") {
result = result.replace("v", "u");
}
if (options.unicodeFlag === "transform") {
result = result.replace("u", "");
}
if (options.dotAllFlag === "transform") {
result = result.replace("s", "");
}
return result;
}

describe('rewritePattern { unicodeFlag }', () => {
const options = {
'unicodeFlag': 'transform'
Expand Down Expand Up @@ -95,19 +119,19 @@ describe('unicodePropertyEscapes', () => {
);
assert.equal(
rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features),
'(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:_|\\uD811[\\uDC00-\\uDE46])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features),
Expand Down Expand Up @@ -219,10 +243,10 @@ describe('unicodePropertyEscapes', () => {
'[\\u{14400}-\\u{14646}]'
);
assert.equal(
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
'unicodePropertyEscapes': 'transform',
}),
'[\\u{14400}-\\u{14646}]'
'[\\0-\\u{143FF}\\u{14647}-\\u{10FFFF}]'
);
});
it('should not transpile unicode property when unicodePropertyEscapes is not enabled', () => {
Expand Down Expand Up @@ -391,13 +415,50 @@ describe('character classes', () => {
if (transpiled != '(?:' + expected + ')') {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
});



describe('unicodeSets (v) flag', () => {
// Re-use the unicode fixtures but replacing the input pattern's `u` flag with `v` flag
for (const fixture of unicodeFixtures) {
if (fixture.flags.includes("u")) {
for (let flag of fixture.flags) {
flag = flag.replace("u", "v");
const { pattern, transpiled: expected } = fixture;
const inputRE = `/${pattern}/${flag}`;
it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => {
const options = {
unicodeSetsFlag: "transform",
unicodeFlag: "transform",
};
const transpiled = rewritePattern(pattern, flag, options);
if (transpiled != "(?:" + expected + ")") {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
}
}

if (IS_NODE_6) return;

for (const fixture of unicodeSetFixtures) {
Expand All @@ -421,12 +482,20 @@ describe('unicodeSets (v) flag', () => {
}, throws);
});
} else {
const transpiled = rewritePattern(pattern, flags, options);
it(`rewrites \`${inputRE}\` correctly ${transformUnicodeFlag ? 'without ' : ''}using the u flag`, () => {
const transpiled = rewritePattern(pattern, flags, options);
if (transpiled != '(?:' + expected + ')') {
assert.strictEqual(transpiled, expected);
}
});
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
}
}

Expand Down

0 comments on commit 91ee342

Please sign in to comment.