Skip to content

Commit

Permalink
Apply case expansion to unicode property escapes within i modifier (#…
Browse files Browse the repository at this point in the history
…97)

* fix: expand case fold for unicode property escape as term

* perf: pack caseFold parameters into bit array

This commit also avoids querying the caseFold flags again for every single character within a character set.

* fix: move case fold expansion to getUnicodePropertyEscapeCharacterClassData

* fix: transform property escapes within i modifier

* improve node.js 6 compat test errors

* skip two failing node.js 6 tests
  • Loading branch information
JLHwung authored Oct 7, 2024
1 parent 382a685 commit b38de88
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 57 deletions.
111 changes: 60 additions & 51 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -128,48 +128,61 @@ const getUnicodePropertyEscapeSet = (value, isNegative) => {
// Builds the character-class data for a Unicode property escape.
// Starts from the code points and strings returned by
// getUnicodePropertyEscapeSet(property, isNegative). When
// configGetCaseFoldFlags() returns a truthy flags value, whatever caseFold()
// yields for each code point (if truthy) is added to the set as well.
// Returns the data object with `singleChars` set, plus `longStrings` and
// `maybeIncludesStrings = true` when the property also yields strings.
const getUnicodePropertyEscapeCharacterClassData = (property, isNegative) => {
const set = getUnicodePropertyEscapeSet(property, isNegative);
const data = getCharacterClassEmptyData();
// NOTE(review): this assignment is overwritten further down with the same set
// object; it looks like the pre-change line of this diff hunk — confirm.
data.singleChars = set.characters;
const singleChars = set.characters;
// The flags are read once up front rather than once per code point.
const caseFoldFlags = configGetCaseFoldFlags();
// A falsy flags value skips the expansion entirely.
if (caseFoldFlags) {
// Iterates the array returned by toArray() while adding to the set itself;
// presumably toArray() is a snapshot, so added code points are not revisited — verify.
for (const codepoint of singleChars.toArray()) {
const folded = caseFold(codepoint, caseFoldFlags);
// Only a truthy caseFold() result is added.
if (folded) {
singleChars.add(folded);
}
}
}
data.singleChars = singleChars;
// Strings are only attached when the property escape actually produced some.
if (set.strings.size > 0) {
data.longStrings = set.strings;
data.maybeIncludesStrings = true;
}
return data;
};

// Evaluates to `config.transform.modifiers` when `config.modifiersData.i` is
// strictly `true`, and to `false` otherwise. The result is therefore meant to
// be used for its truthiness; it is not necessarily a boolean.
function configNeedCaseFoldBMP() {
return config.modifiersData.i === true && config.transform.modifiers
}

function configNeedCaseFoldUnicode() {
// config.modifiersData.i : undefined | false
if (config.modifiersData.i === false) return false;
if (
config.modifiersData.i === true &&
config.transform.modifiers &&
(config.flags.unicode || config.flags.unicodeSets)
) {
return true;
const CASE_FOLD_FLAG_NONE = 0b00;
const CASE_FOLD_FLAG_BMP = 0b01;
const CASE_FOLD_FLAG_UNICODE = 0b10;

function configGetCaseFoldFlags() {
let flags = CASE_FOLD_FLAG_NONE;
if (config.modifiersData.i === true) {
if (config.transform.modifiers) {
flags |= CASE_FOLD_FLAG_BMP;
if (config.flags.unicode || config.flags.unicodeSets) {
flags |= CASE_FOLD_FLAG_UNICODE;
}
}
} else if (config.modifiersData.i === undefined) {
if (config.transform.unicodeFlag && config.flags.ignoreCase) {
flags |= CASE_FOLD_FLAG_UNICODE;
}
}
if (!config.transform.unicodeFlag) return false;
return Boolean(config.modifiersData.i || config.flags.ignoreCase);
return flags;
}

// Given a range of code points, add any case-folded code points in that range
// to a set.
regenerate.prototype.iuAddRange = function(min, max) {
regenerate.prototype.iuAddRange = function(min, max, caseFoldFlags) {
const $this = this;
do {
const folded = caseFold(min, configNeedCaseFoldBMP(), configNeedCaseFoldUnicode());
const folded = caseFold(min, caseFoldFlags);
if (folded) {
$this.add(folded);
}
} while (++min <= max);
return $this;
};
regenerate.prototype.iuRemoveRange = function(min, max) {
regenerate.prototype.iuRemoveRange = function(min, max, caseFoldFlags) {
const $this = this;
do {
const folded = caseFold(min, configNeedCaseFoldBMP(), configNeedCaseFoldUnicode());
const folded = caseFold(min, caseFoldFlags);
if (folded) {
$this.remove(folded);
}
Expand Down Expand Up @@ -208,10 +221,10 @@ const wrap = (tree, pattern) => {
};
};

const caseFold = (codePoint, expandBMP, includeUnicode) => {
let folded = (includeUnicode ? iuMappings.get(codePoint) : undefined) || [];
const caseFold = (codePoint, flags) => {
let folded = ((flags & CASE_FOLD_FLAG_UNICODE) ? iuMappings.get(codePoint) : undefined) || [];
if (typeof folded === "number") folded = [folded];
if (expandBMP) {
if (flags & CASE_FOLD_FLAG_BMP) {
for (const cp of [codePoint].concat(folded)) {
// Fast path for ASCII characters
if (cp >= 0x41 && cp <= 0x5a) {
Expand Down Expand Up @@ -239,8 +252,8 @@ const buildHandler = (action) => {
range: (data, start, end) => {
data.singleChars.addRange(start, end);
},
iuRange: (data, start, end) => {
data.singleChars.iuAddRange(start, end);
iuRange: (data, start, end, caseFoldFlags) => {
data.singleChars.iuAddRange(start, end, caseFoldFlags);
},
nested: (data, nestedData) => {
data.singleChars.add(nestedData.singleChars);
Expand All @@ -261,8 +274,8 @@ const buildHandler = (action) => {
range: (data, start, end) => {
data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
},
iuRange: (data, start, end) => {
data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end).add(data.singleChars);
iuRange: (data, start, end, caseFoldFlags) => {
data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end, caseFoldFlags).add(data.singleChars);
},
nested: (data, nestedData) => {
regSet(data, nestedData.singleChars);
Expand Down Expand Up @@ -292,9 +305,9 @@ const buildHandler = (action) => {
data.longStrings.clear();
data.maybeIncludesStrings = false;
},
iuRange: (data, start, end) => {
if (data.first) data.singleChars.iuAddRange(start, end);
else data.singleChars.intersection(regenerate().iuAddRange(start, end));
iuRange: (data, start, end, caseFoldFlags) => {
if (data.first) data.singleChars.iuAddRange(start, end, caseFoldFlags);
else data.singleChars.intersection(regenerate().iuAddRange(start, end, caseFoldFlags));
data.longStrings.clear();
data.maybeIncludesStrings = false;
},
Expand Down Expand Up @@ -328,9 +341,9 @@ const buildHandler = (action) => {
if (data.first) data.singleChars.addRange(start, end);
else data.singleChars.removeRange(start, end);
},
iuRange: (data, start, end) => {
if (data.first) data.singleChars.iuAddRange(start, end);
else data.singleChars.iuRemoveRange(start, end);
iuRange: (data, start, end, caseFoldFlags) => {
if (data.first) data.singleChars.iuAddRange(start, end, caseFoldFlags);
else data.singleChars.iuRemoveRange(start, end, caseFoldFlags);
},
nested: (data, nestedData) => {
regSet(data, nestedData.singleChars);
Expand Down Expand Up @@ -363,12 +376,9 @@ const getCharacterClassEmptyData = () => ({
maybeIncludesStrings: false
});

const maybeFold = (codePoint) => {
const caseFoldBMP = configNeedCaseFoldBMP();
const caseFoldUnicode = configNeedCaseFoldUnicode();

if (caseFoldBMP || caseFoldUnicode) {
const folded = caseFold(codePoint, caseFoldBMP, caseFoldUnicode);
const maybeFold = (codePoint, caseFoldFlags) => {
if (caseFoldFlags) {
const folded = caseFold(codePoint, caseFoldFlags);
if (folded) {
return [codePoint, folded];
}
Expand All @@ -379,21 +389,20 @@ const maybeFold = (codePoint) => {
const computeClassStrings = (classStrings, regenerateOptions) => {
let data = getCharacterClassEmptyData();

const caseFoldBMP = configNeedCaseFoldBMP();
const caseFoldUnicode = configNeedCaseFoldUnicode();
const caseFoldFlags = configGetCaseFoldFlags();

for (const string of classStrings.strings) {
if (string.characters.length === 1) {
maybeFold(string.characters[0].codePoint).forEach((cp) => {
maybeFold(string.characters[0].codePoint, caseFoldFlags).forEach((cp) => {
data.singleChars.add(cp);
});
} else {
let stringifiedString;
if (caseFoldUnicode || caseFoldBMP) {
if (caseFoldFlags) {
stringifiedString = '';
for (const ch of string.characters) {
let set = regenerate(ch.codePoint);
const folded = maybeFold(ch.codePoint);
const folded = maybeFold(ch.codePoint, caseFoldFlags);
if (folded) set.add(folded);
stringifiedString += set.toString(regenerateOptions);
}
Expand Down Expand Up @@ -437,13 +446,12 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
}

const caseFoldBMP = configNeedCaseFoldBMP();
const caseFoldUnicode = configNeedCaseFoldUnicode();
const caseFoldFlags = configGetCaseFoldFlags();

for (const item of characterClassItem.body) {
switch (item.type) {
case 'value':
const folded = maybeFold(item.codePoint);
const folded = maybeFold(item.codePoint, caseFoldFlags);
folded.forEach((cp) => {
handlePositive.single(data, cp);
});
Expand All @@ -455,8 +463,8 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
const min = item.min.codePoint;
const max = item.max.codePoint;
handlePositive.range(data, min, max);
if (caseFoldBMP || caseFoldUnicode) {
handlePositive.iuRange(data, min, max);
if (caseFoldFlags) {
handlePositive.iuRange(data, min, max, caseFoldFlags);
data.transformed = true;
}
break;
Expand Down Expand Up @@ -623,7 +631,7 @@ const processTerm = (item, regenerateOptions, groups) => {
data.transformed = true;
item = processCharacterClass(item, regenerateOptions, data);
}
} else if (config.transform.unicodePropertyEscapes) {
} else if (config.transform.unicodePropertyEscapes || configGetCaseFoldFlags()) {
update(
item,
data.singleChars.toString(regenerateOptions)
Expand Down Expand Up @@ -700,7 +708,8 @@ const processTerm = (item, regenerateOptions, groups) => {
case 'value':
const codePoint = item.codePoint;
const set = regenerate(codePoint);
const folded = maybeFold(codePoint);
const caseFoldFlags = configGetCaseFoldFlags();
const folded = maybeFold(codePoint, caseFoldFlags);
if (folded.length === 1 && item.kind === "symbol" && folded[0] >= 0x20 && folded[0] <= 0x7E) {
// skip regenerate when it is a printable ASCII symbol
break;
Expand Down
64 changes: 63 additions & 1 deletion tests/fixtures/modifiers.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
const IS_NODE_6 = process.version.startsWith('v6.');

const modifiersFixtures = [
// +i
{
Expand Down Expand Up @@ -103,6 +105,36 @@ const modifiersFixtures = [
'expected': '(?:[Aa][^])',
'expectedFlags': ''
},
!IS_NODE_6 && {
'pattern': '(?i:\\p{Lowercase_Letter})k',
'flags': 'u',
'options': { modifiers: 'transform' },
'matches': ['ck', 'Ck', 'δk', 'Δk', '\u{118A8}k', '\u{118C8}k'],
'nonMatches': ['cK', 'CK', 'δK', 'ΔK', '\u{118A8}K', '\u{118C8}K', 'c\u212A', 'C\u212A'],
'expectedFlags': 'u'
},
!IS_NODE_6 && {
'pattern': '(?i:\\p{Lowercase_Letter})k',
'flags': 'u',
'options': { unicodePropertyEscapes: 'transform', modifiers: 'transform' },
'matches': ['ck', 'Ck', 'δk', 'Δk', '\u{118A8}k', '\u{118C8}k'],
'nonMatches': ['cK', 'CK', 'δK', 'ΔK', '\u{118A8}K', '\u{118C8}K', 'c\u212A', 'C\u212A'],
'expectedFlags': 'u'
},
{
'pattern': '(?i:[\\p{Lowercase_Letter}&&\\p{ASCII}])a',
'flags': 'v',
'options': { unicodeSetsFlag: 'transform', modifiers: 'transform' },
'expected': '(?:[A-Za-z\\u017F\\u212A])a',
'expectedFlags': 'u'
},
{
'pattern': '(?i:[\\p{Lowercase_Letter}&&\\p{ASCII}])a',
'flags': 'v',
'options': { unicodeSetsFlag: 'transform', unicodePropertyEscapes: 'transform', modifiers: 'transform' },
'expected': '(?:[A-Za-z\\u017F\\u212A])a',
'expectedFlags': 'u'
},
// +m
{
'pattern': '(?m:^[a-z])',
Expand Down Expand Up @@ -167,6 +199,36 @@ const modifiersFixtures = [
'expected': '([Aa](?:a))',
'expectedFlags': ''
},
{
'pattern': '\\p{Lowercase_Letter}(?-i:k)',
'flags': 'iu',
'options': { modifiers: 'transform' },
'matches': ['ck', 'Ck', 'δk', 'Δk', '\u{118A8}k', '\u{118C8}k'],
'nonMatches': ['cK', 'CK', 'δK', 'ΔK', '\u{118A8}K', '\u{118C8}K', 'c\u212A', 'C\u212A'],
'expectedFlags': 'u'
},
{
'pattern': '\\p{Lowercase_Letter}(?-i:k)',
'flags': 'iu',
'options': { unicodePropertyEscapes: 'transform', modifiers: 'transform' },
'matches': ['ck', 'Ck', 'δk', 'Δk', '\u{118A8}k', '\u{118C8}k'],
'nonMatches': ['cK', 'CK', 'δK', 'ΔK', '\u{118A8}K', '\u{118C8}K', 'c\u212A', 'C\u212A'],
'expectedFlags': 'u'
},
{
'pattern': '[\\p{Lowercase_Letter}&&\\p{ASCII}](?-i:a)',
'flags': 'iv',
'options': { unicodeSetsFlag: 'transform', modifiers: 'transform' },
'expected': '[A-Za-z\\u017F\\u212A](?:a)',
'expectedFlags': 'u'
},
{
'pattern': '[\\p{Lowercase_Letter}&&\\p{ASCII}](?-i:a)',
'flags': 'iv',
'options': { unicodeSetsFlag: 'transform', unicodePropertyEscapes: 'transform', modifiers: 'transform' },
'expected': '[A-Za-z\\u017F\\u212A](?:a)',
'expectedFlags': 'u'
},
// -m
{
'pattern': '(?-m:^[a-z])(^[a-z])',
Expand Down Expand Up @@ -220,6 +282,6 @@ const modifiersFixtures = [
'expected': '(?:^[a-z].)(^[a-z].)',
'expectedFlags': '',
},
];
].filter(Boolean);

exports.modifiersFixtures = modifiersFixtures;
13 changes: 8 additions & 5 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
const { modifiersFixtures } = require("./fixtures/modifiers.js");

/** For node 6 compat */
assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) });
assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) });
assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null, `${value} does not match ${regex.toString()}`) });
assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null, `${value} does match ${regex.toString()}`) });

/**
* compute output regex flags from input flags and transform options
Expand Down Expand Up @@ -71,6 +71,7 @@ const getPropertyValuePattern = (path) => {
};

describe('unicodePropertyEscapes', () => {
// ignore tests as @unicode/unicode-* library does not support node.js 6
if (IS_NODE_6) return;

const features = {
Expand Down Expand Up @@ -530,16 +531,18 @@ describe('modifiers', () => {

it('rewrites `/' + pattern + '/' + flags + '` correctly', () => {
const transpiled = rewritePattern(pattern, flags, options);
assert.strictEqual(transpiled, expected);
if (expected != undefined) {
assert.strictEqual(transpiled, expected);
}
if (fixture.expectedFlags != undefined) {
assert.strictEqual(actualFlags, fixture.expectedFlags);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(transpiled, getOutputFlags(flags, options));
const transpiledRegex = new RegExp(transpiled, actualFlags);
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(transpiled, getOutputFlags(flags, options));
const transpiledRegex = new RegExp(transpiled, actualFlags);
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
Expand Down

0 comments on commit b38de88

Please sign in to comment.