Skip to content

Commit

Permalink
improve handling character class
Browse files Browse the repository at this point in the history
  • Loading branch information
JLHwung committed Sep 21, 2023
1 parent 2b55fe7 commit 6cb420e
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 31 deletions.
16 changes: 11 additions & 5 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ function flatMap(array, callback) {
return result;
}

/**
 * Tell whether a regenerate set contains at least one astral symbol
 * (a code point above U+FFFF).
 *
 * Relies on regenerate's internal representation: `data` is a flat, ascending
 * array of `[start, endExclusive]` pairs, so the final entry is one past the
 * highest code point in the set.
 *
 * @param {{ data: number[] }} regenerateData A regenerate set (or any object
 *   exposing the same `data` array).
 * @returns {boolean} `true` when the highest code point is >= U+10000.
 */
function regenerateContainsAstral(regenerateData) {
	const data = regenerateData.data;
	if (data.length === 0) {
		return false;
	}
	// The last entry is an exclusive upper bound: a set that ends at U+FFFF
	// stores 0x10000 here, which must not be treated as astral.
	const highestEndExclusive = data[data.length - 1];
	return highestEndExclusive > 0x10000;
}

const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;

// Prepare a Regenerate set containing all code points, used for negative
Expand Down Expand Up @@ -488,7 +493,9 @@ const processCharacterClass = (
const negative = characterClassItem.negative;
const { singleChars, transformed, longStrings } = computed;
if (transformed) {
const setStr = singleChars.toString(regenerateOptions);
// If singleChars already contains an astral character, regenerate (bmpOnly: true) will create valid regex strings
const bmpOnly = regenerateContainsAstral(singleChars);
const setStr = singleChars.toString({ ...regenerateOptions, bmpOnly: bmpOnly });

if (negative) {
if (config.useUnicodeFlag) {
Expand Down Expand Up @@ -518,10 +525,9 @@ const processCharacterClass = (
);
} else {
// Generate negative set directly when case folding is not involved.
update(
characterClassItem,
UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
);
const negativeSet = UNICODE_SET.clone().remove(singleChars);
const bmpOnly = regenerateContainsAstral(negativeSet);
update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
}
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`);
Expand Down
16 changes: 12 additions & 4 deletions tests/fixtures/character-class.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,33 @@ const characterClassFixtures = [
{
pattern: '[^K]', // LATIN CAPITAL LETTER K
flags: 'u',
expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["K"],
expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^k]', // LATIN SMALL LETTER K
flags: 'u',
expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["k"],
expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u212a]', // KELVIN SIGN
flags: 'u',
expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u212a"],
expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
flags: 'u',
expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u{1D50E}"],
expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
Expand Down
8 changes: 6 additions & 2 deletions tests/fixtures/unicode-set.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,21 @@ const unicodeSetFixtures = [
},
{
pattern: '[^[a-z][f-h]]',
matches: ["A", "\u{12345}"],
matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:(?![a-z])[\\s\\S])',
expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
Expand Down
13 changes: 9 additions & 4 deletions tests/fixtures/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ const unicodeFixtures = [
{
'pattern': '[\\s\\S]',
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\d',
Expand All @@ -68,8 +68,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\d\\D]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\w',
Expand Down Expand Up @@ -100,8 +101,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\w\\W]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
Expand Down Expand Up @@ -180,11 +182,14 @@ const unicodeFixtures = [
},
{
'pattern': '[^a]',
'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
'nonMatches': ['a'],
'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[^a]',
'nonMatches': ['a', 'A'],
'flags': FLAGS_WITH_UNICODE_WITH_I,
'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
Expand Down
65 changes: 49 additions & 16 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,27 @@ const { characterClassFixtures } = require("./fixtures/character-class.js");
const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
const { modifiersFixtures } = require("./fixtures/modifiers.js");

/**
 * Node 6 compatibility: install `assert.match` / `assert.doesNotMatch` when
 * the running Node version does not provide them.
 */
if (!assert.match) {
	assert.match = function match(value, regex) {
		assert.ok(regex.exec(value) !== null);
	};
}
if (!assert.doesNotMatch) {
	assert.doesNotMatch = function doesNotMatch(value, regex) {
		assert.ok(regex.exec(value) === null);
	};
}

/**
 * Compute the flags of the output regex from the input flags and the
 * transform options.
 *
 * - When `options.unicodeSetsFlag` is "transform", the first `v` becomes `u`.
 * - When `options.unicodeFlag` is "transform", the first `u` is then removed.
 *
 * @param {string} inputFlags Flags of the source pattern.
 * @param {*} options Transform options; only `unicodeSetsFlag` and
 *   `unicodeFlag` are read.
 * @returns {string} Flags to use when constructing the transpiled regex.
 */
function getOutputFlags(inputFlags, options) {
	const afterSetsFlag =
		options.unicodeSetsFlag === "transform"
			? inputFlags.replace("v", "u")
			: inputFlags;
	return options.unicodeFlag === "transform"
		? afterSetsFlag.replace("u", "")
		: afterSetsFlag;
}

describe('rewritePattern { unicodeFlag }', () => {
const options = {
'unicodeFlag': 'transform'
Expand Down Expand Up @@ -95,19 +116,19 @@ describe('unicodePropertyEscapes', () => {
);
assert.equal(
rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features),
'(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:_|\\uD811[\\uDC00-\\uDE46])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features),
Expand Down Expand Up @@ -391,6 +412,14 @@ describe('character classes', () => {
if (transpiled != '(?:' + expected + ')') {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
});
Expand All @@ -406,13 +435,22 @@ describe('unicodeSets (v) flag', () => {
const { pattern, transpiled: expected } = fixture;
const inputRE = `/${pattern}/${flag}`;
it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => {
const transpiled = rewritePattern(pattern, flag, {
const options = {
unicodeSetsFlag: "transform",
unicodeFlag: "transform",
});
};
const transpiled = rewritePattern(pattern, flag, options);
if (transpiled != "(?:" + expected + ")") {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
}
Expand Down Expand Up @@ -447,18 +485,13 @@ describe('unicodeSets (v) flag', () => {
assert.strictEqual(transpiled, expected);
}
});
if (fixture.matches) {
// todo: infer output flags from fixture input flags and options
const transpiledRegex = new RegExp(`^${transpiled}$`);
for (const match of fixture.matches) {
assert.match(match, transpiledRegex);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
if (fixture.nonMatches) {
const transpiledRegex = new RegExp(`^${transpiled}$`);
for (const nonMatch of fixture.nonMatches) {
assert.doesNotMatch(nonMatch, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
}
}
Expand Down

0 comments on commit 6cb420e

Please sign in to comment.