Skip to content

Commit ead22e1

Browse files
Tests: Cache results for exp backtracking check (#3356)
1 parent 17ed916 commit ead22e1

File tree

1 file changed

+117
-69
lines changed

1 file changed

+117
-69
lines changed

tests/pattern-tests.js

+117-69
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const TestCase = require('./helper/test-case');
88
const { BFS, BFSPathToPrismTokenPath, parseRegex } = require('./helper/util');
99
const { languages } = require('../components.json');
1010
const { visitRegExpAST } = require('regexpp');
11-
const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers } = require('refa');
11+
const { transform, combineTransformers, getIntersectionWordSets, JS, Words, NFA, Transformers, isDisjointWith } = require('refa');
1212
const scslre = require('scslre');
1313
const { argv } = require('yargs');
1414
const RAA = require('regexp-ast-analysis');
@@ -461,6 +461,50 @@ const transformer = combineTransformers([
461461
]);
462462

463463

464+
/** @type {Map<string, Map<string, Error | null>>} */
465+
// Registry of named result caches; entries are created on demand by getResultCache().
const resultCache = new Map();
466+
/**
 * Returns the result cache registered under `cacheName`.
 *
 * If no cache exists for that name yet, an empty one is created, registered in `resultCache`, and returned, so
 * repeated calls with the same name always yield the same `Map` instance.
 *
 * @param {string} cacheName
 * @returns {Map<string, Error | null>}
 */
function getResultCache(cacheName) {
	const existing = resultCache.get(cacheName);
	if (existing !== undefined) {
		return existing;
	}

	const created = new Map();
	resultCache.set(cacheName, created);
	return created;
}
477+
/**
 * Runs `compute` on `cacheKey`, remembering the outcome so that nodes with the same raw source are only
 * checked once per cache.
 *
 * Nodes that contain a backreference are never cached: `compute` is simply invoked for them on every call
 * (presumably because the meaning of a backreference depends on the surrounding pattern - TODO confirm).
 *
 * For every other node, the outcome is stored under `cacheKey.raw` in the cache named `cacheName`:
 * `null` if `compute` returned normally, otherwise the value it threw. A stored thrown value is rethrown on
 * every call, including the call that produced it, so later callers receive the value captured the first time.
 *
 * NOTE(review): the key consists of the node's raw source only; regex flags are not part of it. Verify that
 * two nodes with identical source but different flags can safely share a result.
 *
 * @param {string} cacheName
 * @param {T} cacheKey
 * @param {(node: T) => void} compute
 * @returns {void}
 * @template {import('regexpp/ast').Node} T
 */
function withResultCache(cacheName, cacheKey, compute) {
	if (RAA.hasSomeDescendant(cacheKey, descendant => descendant.type === 'Backreference')) {
		// Not cacheable: always run the check directly.
		compute(cacheKey);
		return;
	}

	const outcomes = getResultCache(cacheName);
	const key = cacheKey.raw;

	let outcome = outcomes.get(key);
	if (outcome === undefined) {
		// First time this raw source is seen: run the check and record whether it passed or threw.
		outcome = null;
		try {
			compute(cacheKey);
		} catch (error) {
			outcome = error;
		}
		outcomes.set(key, outcome);
	}

	if (outcome) {
		throw outcome;
	}
}
507+
464508
/**
465509
* @param {string} path
466510
* @param {RegExp} pattern
@@ -510,32 +554,34 @@ function checkExponentialBacktracking(path, pattern, ast) {
510554
return;
511555
}
512556

513-
const alternatives = node.alternatives;
514-
515-
const total = toNFA(alternatives[0]);
516-
total.withoutEmptyWord();
517-
for (let i = 1, l = alternatives.length; i < l; i++) {
518-
const a = alternatives[i];
519-
const current = toNFA(a);
520-
current.withoutEmptyWord();
521-
522-
if (!total.isDisjointWith(current)) {
523-
assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
524-
+ ` This will cause exponential backtracking.`
525-
+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
526-
+ ` The goal is that all of its alternatives are disjoint.`
527-
+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
528-
+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
529-
+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
530-
+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
531-
+ `\nIn the real world, patterns can be a lot harder to fix.`
532-
+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
533-
+ ` A maintainer will help you.`
534-
+ `\n\nFull pattern:\n${pattern}`);
535-
} else if (i !== l - 1) {
536-
total.union(current);
557+
withResultCache('disjointAlternatives', node, () => {
558+
const alternatives = node.alternatives;
559+
560+
const total = toNFA(alternatives[0]);
561+
total.withoutEmptyWord();
562+
for (let i = 1, l = alternatives.length; i < l; i++) {
563+
const a = alternatives[i];
564+
const current = toNFA(a);
565+
current.withoutEmptyWord();
566+
567+
if (!isDisjointWith(total, current)) {
568+
assert.fail(`${path}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.`
569+
+ ` This will cause exponential backtracking.`
570+
+ `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.`
571+
+ ` The goal is that all of its alternatives are disjoint.`
572+
+ ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.`
573+
+ `\n\nExample: \`(?:[ab]|\\w|::)+\``
574+
+ `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.`
575+
+ ` In this example, the pattern can easily be fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.`
576+
+ `\nIn the real world, patterns can be a lot harder to fix.`
577+
+ ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.`
578+
+ ` A maintainer will help you.`
579+
+ `\n\nFull pattern:\n${pattern}`);
580+
} else if (i !== l - 1) {
581+
total.union(current);
582+
}
537583
}
538-
}
584+
});
539585
}
540586

541587
visitRegExpAST(ast.pattern, {
@@ -555,49 +601,51 @@ function checkExponentialBacktracking(path, pattern, ast) {
555601
return; // not a group
556602
}
557603

558-
// The idea here is the following:
559-
//
560-
// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
561-
// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
562-
// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
563-
// is accepted by `A{m}`.
564-
// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
565-
// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
566-
// requirement for exponential backtracking.
567-
//
568-
// This is actually only a crude approximation for the real analysis that would have to be done. We
569-
// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
570-
// cases, the approximation is good enough.
571-
572-
const nfa = toNFA(node.element);
573-
nfa.withoutEmptyWord();
574-
const twoStar = nfa.copy();
575-
twoStar.quantify(2, Infinity);
576-
577-
if (!nfa.isDisjointWith(twoStar)) {
578-
const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
579-
const example = Words.fromUnicodeToString(word);
580-
assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
581-
+ ` This will cause exponential backtracking.`
582-
+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
583-
+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
584-
+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
585-
+ `\n\nExample 1: \`(?:\\w+|::)+\``
586-
+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
587-
+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
588-
+ `\nExample 2: \`(?:\\w|Foo)+\``
589-
+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
590-
+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
591-
+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
592-
+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
593-
+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
594-
+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
595-
+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
596-
+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
597-
+ `\n\nIn the real world, patterns can be a lot harder to fix.`
598-
+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
599-
+ `\n\nFull pattern:\n${pattern}`);
600-
}
604+
withResultCache('2star', node, () => {
605+
// The idea here is the following:
606+
//
607+
// We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be
608+
// the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w`
609+
// that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w`
610+
// is accepted by `A{m}`.
611+
// This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as
612+
// `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main
613+
// requirement for exponential backtracking.
614+
//
615+
// This is actually only a crude approximation for the real analysis that would have to be done. We
616+
// would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most
617+
// cases, the approximation is good enough.
618+
619+
const nfa = toNFA(node.element);
620+
nfa.withoutEmptyWord();
621+
const twoStar = nfa.copy();
622+
twoStar.quantify(2, Infinity);
623+
624+
if (!isDisjointWith(nfa, twoStar)) {
625+
const word = Words.pickMostReadableWord(firstOf(getIntersectionWordSets(nfa, twoStar)));
626+
const example = Words.fromUnicodeToString(word);
627+
assert.fail(`${path}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.`
628+
+ ` This will cause exponential backtracking.`
629+
+ `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.`
630+
+ ` The goal is modify E such that it is disjoint with repetitions of itself.`
631+
+ ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. to match that (sub-)string.`
632+
+ `\n\nExample 1: \`(?:\\w+|::)+\``
633+
+ `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.`
634+
+ ` In this example, the pattern can easily be fixed by changing \`\\w+\` to \`\\w\`.`
635+
+ `\nExample 2: \`(?:\\w|Foo)+\``
636+
+ `\nThe problem lies in \`\\w\` and \`Foo\` because the string "Foo" can be matched as either repeating \`\\w\` 3 times or by using the \`Foo\` alternative once.`
637+
+ ` In this example, the pattern can easily be fixed because the \`Foo\` alternative is redundant can can be removed.`
638+
+ `\nExample 3: \`(?:\\.\\w+(?:<.*?>)?)+\``
639+
+ `\nThe problem lies in \`<.*?>\`. The string ".a<>.a<>" can be matched as either \`\\. \\w < . . . . >\` or \`\\. \\w < > \\. \\w < >\`.`
640+
+ ` When it comes to exponential backtracking, it doesn't matter whether a quantifier is greedy or lazy.`
641+
+ ` This means that the lazy \`.*?\` can jump over \`>\`.`
642+
+ ` In this example, the pattern can easily be fixed because we just have to prevent \`.*?\` jumping over \`>\`.`
643+
+ ` This can done by replacing \`<.*?>\` with \`<[^\\r\\n>]*>\`.`
644+
+ `\n\nIn the real world, patterns can be a lot harder to fix.`
645+
+ ` If you are trying to make this test pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway, a maintainer will help you.`
646+
+ `\n\nFull pattern:\n${pattern}`);
647+
}
648+
});
601649
},
602650
});
603651

0 commit comments

Comments
 (0)