Skip to content

Commit

Permalink
feat: added non-greedy range repetition
Browse files Browse the repository at this point in the history
  • Loading branch information
ColinEberhardt committed Feb 7, 2021
1 parent 3c4ee8f commit d2127dc
Show file tree
Hide file tree
Showing 10 changed files with 172 additions and 51 deletions.
142 changes: 111 additions & 31 deletions assembly/__spec_tests__/generated.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -368,16 +368,28 @@ it("line: 49 - matches ^(abc){1,2}zz against 'abcabcabczz'", () => {
it("line: 50 - matches ^(abc){1,2}zz against '>>abczz'", () => {
expectNotMatch("^(abc){1,2}zz", [">>abczz"]);
});
xit("line: 51 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 52 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 53 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 54 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 55 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 56 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 57 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 58 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 59 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 60 - lazy range repitition quantifiers are not supported", () => {});
it("line: 51 - matches ^(b+?|a){1,2}?c against 'bc'", () => {
const match = exec("^(b+?|a){1,2}?c", "bc", "s");
expect(match.matches[0]).toBe("bc".substring(0, 2));
expect(match.matches[1]).toBe("bc".substring(0, 1));
});
xit("line: 52 - issues with repeated capture groups", () => {});
xit("line: 53 - issues with repeated capture groups", () => {});
xit("line: 54 - issues with repeated capture groups", () => {});
xit("line: 55 - issues with repeated capture groups", () => {});
it("line: 56 - matches ^(b+?|a){1,2}?c against 'aac'", () => {
const match = exec("^(b+?|a){1,2}?c", "aac", "s");
expect(match.matches[0]).toBe("aac".substring(0, 3));
expect(match.matches[1]).toBe("aac".substring(1, 2));
});
xit("line: 57 - issues with repeated capture groups", () => {});
xit("line: 58 - issues with repeated capture groups", () => {});
it("line: 59 - matches ^(b+?|a){1,2}?c against 'aaac'", () => {
expectNotMatch("^(b+?|a){1,2}?c", ["aaac"]);
});
it("line: 60 - matches ^(b+?|a){1,2}?c against 'abbbbbbbbbbbac'", () => {
expectNotMatch("^(b+?|a){1,2}?c", ["abbbbbbbbbbbac"]);
});
it("line: 61 - matches ^(b+|a){1,2}c against 'bc'", () => {
const match = exec("^(b+|a){1,2}c", "bc", "s");
expect(match.matches[0]).toBe("bc".substring(0, 2));
Expand All @@ -400,17 +412,41 @@ it("line: 69 - matches ^(b+|a){1,2}c against 'aaac'", () => {
it("line: 70 - matches ^(b+|a){1,2}c against 'abbbbbbbbbbbac'", () => {
expectNotMatch("^(b+|a){1,2}c", ["abbbbbbbbbbbac"]);
});
xit("line: 71 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 72 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 73 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 74 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 75 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 76 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 77 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 78 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 79 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 80 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 81 - lazy range repitition quantifiers are not supported", () => {});
it("line: 71 - matches ^(b+|a){1,2}?bc against 'bbc'", () => {
const match = exec("^(b+|a){1,2}?bc", "bbc", "s");
expect(match.matches[0]).toBe("bbc".substring(0, 3));
expect(match.matches[1]).toBe("bbc".substring(0, 1));
});
xit("line: 72 - issues with repeated capture groups", () => {});
xit("line: 73 - issues with repeated capture groups", () => {});
it("line: 74 - matches ^(b*|ba){1,2}?bc against 'bababc'", () => {
const match = exec("^(b*|ba){1,2}?bc", "bababc", "s");
expect(match.matches[0]).toBe("bababc".substring(0, 6));
expect(match.matches[1]).toBe("bababc".substring(2, 4));
});
it("line: 75 - matches ^(b*|ba){1,2}?bc against 'bababbc'", () => {
expectNotMatch("^(b*|ba){1,2}?bc", ["bababbc"]);
});
it("line: 76 - matches ^(b*|ba){1,2}?bc against 'babababc'", () => {
expectNotMatch("^(b*|ba){1,2}?bc", ["babababc"]);
});
it("line: 77 - matches ^(ba|b*){1,2}?bc against 'babc'", () => {
const match = exec("^(ba|b*){1,2}?bc", "babc", "s");
expect(match.matches[0]).toBe("babc".substring(0, 4));
expect(match.matches[1]).toBe("babc".substring(0, 2));
});
xit("line: 78 - issues with repeated capture groups", () => {});
it("line: 79 - matches ^(ba|b*){1,2}?bc against 'bababc'", () => {
const match = exec("^(ba|b*){1,2}?bc", "bababc", "s");
expect(match.matches[0]).toBe("bababc".substring(0, 6));
expect(match.matches[1]).toBe("bababc".substring(2, 4));
});
it("line: 80 - matches ^(ba|b*){1,2}?bc against 'bababbc'", () => {
expectNotMatch("^(ba|b*){1,2}?bc", ["bababbc"]);
});
it("line: 81 - matches ^(ba|b*){1,2}?bc against 'babababc'", () => {
expectNotMatch("^(ba|b*){1,2}?bc", ["babababc"]);
});
xit("line: 82 - test regex contains syntax not supported in JS", () => {});
it("line: 83 - matches ^[ab\\]cde] against 'athing'", () => {
const match = exec("^[ab\\]cde]", "athing", "s");
Expand Down Expand Up @@ -1120,11 +1156,26 @@ it("line: 244 - matches ^[aeiou\\d]{4,5}$ against 'aaaaa'", () => {
it("line: 245 - matches ^[aeiou\\d]{4,5}$ against '123456'", () => {
expectNotMatch("^[aeiou\\d]{4,5}$", ["123456"]);
});
xit("line: 246 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 247 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 248 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 249 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 250 - lazy range repitition quantifiers are not supported", () => {});
it("line: 246 - matches ^[aeiou\\d]{4,5}? against 'uoie'", () => {
const match = exec("^[aeiou\\d]{4,5}?", "uoie", "s");
expect(match.matches[0]).toBe("uoie".substring(0, 4));
});
it("line: 247 - matches ^[aeiou\\d]{4,5}? against '1234'", () => {
const match = exec("^[aeiou\\d]{4,5}?", "1234", "s");
expect(match.matches[0]).toBe("1234".substring(0, 4));
});
it("line: 248 - matches ^[aeiou\\d]{4,5}? against '12345'", () => {
const match = exec("^[aeiou\\d]{4,5}?", "12345", "s");
expect(match.matches[0]).toBe("12345".substring(0, 4));
});
it("line: 249 - matches ^[aeiou\\d]{4,5}? against 'aaaaa'", () => {
const match = exec("^[aeiou\\d]{4,5}?", "aaaaa", "s");
expect(match.matches[0]).toBe("aaaaa".substring(0, 4));
});
it("line: 250 - matches ^[aeiou\\d]{4,5}? against '123456'", () => {
const match = exec("^[aeiou\\d]{4,5}?", "123456", "s");
expect(match.matches[0]).toBe("123456".substring(0, 4));
});
xit("line: 251 - back references are not supported", () => {});
xit("line: 252 - back references are not supported", () => {});
xit("line: 253 - back references are not supported", () => {});
Expand Down Expand Up @@ -1182,8 +1233,16 @@ xit("line: 287 - non capturing groups not supported", () => {});
xit("line: 288 - non capturing groups not supported", () => {});
xit("line: 289 - non capturing groups not supported", () => {});
xit("line: 290 - the test behaviour differs between PCRE and JS", () => {});
xit("line: 291 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 292 - lazy range repitition quantifiers are not supported", () => {});
it("line: 291 - matches ^[ab]{1,3}?(ab*|b) against 'aabbbbb'", () => {
const match = exec("^[ab]{1,3}?(ab*|b)", "aabbbbb", "s");
expect(match.matches[0]).toBe("aabbbbb".substring(0, 7));
expect(match.matches[1]).toBe("aabbbbb".substring(1, 7));
});
it("line: 292 - matches ^[ab]{1,3}?(ab*?|b) against 'aabbbbb'", () => {
const match = exec("^[ab]{1,3}?(ab*?|b)", "aabbbbb", "s");
expect(match.matches[0]).toBe("aabbbbb".substring(0, 2));
expect(match.matches[1]).toBe("aabbbbb".substring(1, 2));
});
it("line: 293 - matches ^[ab]{1,3}(ab*?|b) against 'aabbbbb'", () => {
const match = exec("^[ab]{1,3}(ab*?|b)", "aabbbbb", "s");
expect(match.matches[0]).toBe("aabbbbb".substring(0, 4));
Expand Down Expand Up @@ -1503,7 +1562,10 @@ it("line: 1224 - matches a{0}bc against 'bc'", () => {
const match = exec("a{0}bc", "bc", "s");
expect(match.matches[0]).toBe("bc".substring(0, 2));
});
xit("line: 1225 - lazy range repitition quantifiers are not supported", () => {});
it("line: 1225 - matches (a|(bc)){0,0}?xyz against 'xyz'", () => {
const match = exec("(a|(bc)){0,0}?xyz", "xyz", "s");
expect(match.matches[0]).toBe("xyz".substring(0, 3));
});
xit("line: 1226 - back references are not supported", () => {});
xit("line: 1227 - back references are not supported", () => {});
xit("line: 1228 - back references are not supported", () => {});
Expand Down Expand Up @@ -1617,8 +1679,26 @@ it("line: 1267 - matches [^az] against 'aaAabcd '", () => {
expect(match.matches[0]).toBe("aaAabcd ".substring(4, 5));
});
xit("line: 1268 - back references are not supported", () => {});
xit("line: 1269 - lazy range repitition quantifiers are not supported", () => {});
xit("line: 1270 - lazy range repitition quantifiers are not supported", () => {});
it("line: 1269 - matches P[^*]TAIRE[^*]{1,6}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => {
const match = exec(
"P[^*]TAIRE[^*]{1,6}?LL",
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx",
"s"
);
expect(match.matches[0]).toBe(
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22)
);
});
it("line: 1270 - matches P[^*]TAIRE[^*]{1,}?LL against 'xxxxxxxxxxxPSTAIREISLLxxxxxxxxx'", () => {
const match = exec(
"P[^*]TAIRE[^*]{1,}?LL",
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx",
"s"
);
expect(match.matches[0]).toBe(
"xxxxxxxxxxxPSTAIREISLLxxxxxxxxx".substring(11, 22)
);
});
it("line: 1271 - matches (\\.\\d\\d[1-9]?)\\d+ against '1.230003938'", () => {
const match = exec("(\\.\\d\\d[1-9]?)\\d+", "1.230003938", "s");
expect(match.matches[0]).toBe("1.230003938".substring(1, 11));
Expand Down
12 changes: 6 additions & 6 deletions assembly/__tests__/quantifiers.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ describe("non-greedy", () => {
expect(match.matches[0]).toStrictEqual("ab");
});

// Lazy zero-or-one (`a??`) case. It is registered with `xit` instead of being
// commented out so the body keeps compiling and the case is listed as skipped
// rather than silently disappearing from the suite.
// TODO(review): the reason this case was disabled is not visible here; switch
// back to `it` once `a??` yields the empty match asserted below.
xit("zero or one supports non-greedy mode", () => {
  expectMatch("a?", ["a"]);
  const match = exec("a??", "bc");
  expect(match).not.toBeNull();
  expect(match.matches[0]).toStrictEqual("");
});
});
5 changes: 5 additions & 0 deletions assembly/__tests__/range-quantifiers.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ it("handles nested quantifiers", () => {
expectMatch("(a{3}){2}", ["aaaaaa"]);
});

// With the lazy form `a{2,4}?` the expected match is just "aa", even though
// the input offers many more "a"s.
it("handles nongreedy quantifiers", () => {
  const input = "aaaaaaaaaa";
  const shortest = exec("a{2,4}?", input).matches[0];
  expect(shortest).toBe("aa");
});

it("throws if quantifying a quantifier!", () => {
expect(() => {
let foo = new RegExp("a{3}{2}");
Expand Down
13 changes: 9 additions & 4 deletions assembly/nfa/nfa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,16 @@ function closure(nfa: Automata, greedy: bool): Automata {
return new Automata(start, end);
}

/**
 * Wraps `nfa` so that it may be matched zero or one time (the `?` quantifier).
 *
 * A fresh start state receives two transitions: one into `nfa` and one
 * straight to a fresh end state (the "skip" path). `nfa.end` is also linked to
 * the new end state. The greedy and lazy forms differ only in the order in
 * which the two start transitions are added.
 *
 * @param nfa automaton for the quantified expression; its end state gains a
 *   transition to the new end state
 * @param greedy when true the transition into `nfa` is added before the skip
 *   transition; when false the skip transition is added first. Defaults to
 *   true so callers that pass only `nfa` keep the greedy ordering.
 * @returns a new automaton spanning the fresh start and end states
 */
function zeroOrOne(nfa: Automata, greedy: bool = true): Automata {
  const start = new State();
  const end = new State();
  // NOTE(review): presumably the matcher explores transitions in insertion
  // order, which is what would make this ordering select greedy vs lazy
  // behaviour — confirm against the matcher implementation.
  if (greedy) {
    start.transitions.push(nfa.start);
    start.transitions.push(end);
  } else {
    start.transitions.push(end);
    start.transitions.push(nfa.start);
  }
  nfa.end.transitions.push(end);
  return new Automata(start, end);
}
Expand Down Expand Up @@ -182,7 +187,7 @@ class AutomataFactor {
const automata = this.automataForNode(node.expression);
const quantifier = node.quantifier;
if (quantifier == Char.Question) {
return zeroOrOne(automata);
return zeroOrOne(automata, node.greedy);
} else if (quantifier == Char.Plus) {
return oneOrMore(automata, node.greedy);
} else if (quantifier == Char.Asterisk) {
Expand Down
7 changes: 6 additions & 1 deletion assembly/parser/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,12 @@ export class RepetitionNode extends Node {
}

export class RangeRepetitionNode extends Node {
constructor(public expression: Node, public from: i32, public to: i32) {
constructor(
public expression: Node,
public from: i32,
public to: i32,
public greedy: bool = true
) {
super(NodeType.RangeRepetition);
if (expression.type == NodeType.RangeRepetition) {
throw new Error("The preceding token is not quantifiable");
Expand Down
9 changes: 8 additions & 1 deletion assembly/parser/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,14 @@ export class Parser {
const range = this.maybeParseRepetitionRange();
if (range != null) {
const expression = nodes.pop();
nodes.push(new RangeRepetitionNode(expression, range.from, range.to));
let greedy = true;
if (this.iterator.current == Char.Question) {
greedy = false;
this.eatToken();
}
nodes.push(
new RangeRepetitionNode(expression, range.from, range.to, greedy)
);
} else {
// this is not the start of a repetition, it's just a char!
nodes.push(this.parseCharacter());
Expand Down
1 change: 1 addition & 0 deletions assembly/parser/string-iterator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export class StringIterator {
next(): bool {
this.cursor++;
if (this.cursor >= u32(this.sourceString.length)) {
this.current = -1;
return false;
}
this.current = this.sourceString.charCodeAt(this.cursor);
Expand Down
16 changes: 14 additions & 2 deletions assembly/parser/walker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,24 @@ export function expandRepetitions(visitor: NodeVisitor): void {

if (rangeRepNode.to == -1) {
// a{4,} => aaaaa*
clones.push(new RepetitionNode(expression.clone(), Char.Asterisk));
clones.push(
new RepetitionNode(
expression.clone(),
Char.Asterisk,
rangeRepNode.greedy
)
);
} else {
// a{4,6} => aaaaa?a?
const count = rangeRepNode.to - rangeRepNode.from;
for (let i = 0; i < count; i++) {
clones.push(new RepetitionNode(expression.clone(), Char.Question));
clones.push(
new RepetitionNode(
expression.clone(),
Char.Question,
rangeRepNode.greedy
)
);
}
}

Expand Down
14 changes: 10 additions & 4 deletions spec/test-generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ const knownIssues = {
...range(63, 68),
1391,
1392,
...range(52, 55),
57,
58,
72,
73,
78,
],
"lazy quantifiers should still yield the longest overall regex match": [
...range(141, 143),
Expand Down Expand Up @@ -109,10 +115,10 @@ lines.forEach((line, index) => {
return;
}

if (["}?"].some((f) => regex.includes(f))) {
testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`;
return;
}
// if (["}?"].some((f) => regex.includes(f))) {
// testCase += `xit("line: ${index} - lazy range repitition quantifiers are not supported", () => { });`;
// return;
// }

if (["(?"].some((f) => regex.includes(f))) {
testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`;
Expand Down
4 changes: 2 additions & 2 deletions ts/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ globalAny.log = console.log;

import { RegExp } from "../assembly/regexp";

// Scratch driver: compiles the greedy zero-or-one pattern `a?`, runs it
// against "a" and logs whatever `exec` returns.
const regexObj = new RegExp("a?");
const match = regexObj.exec("a");

console.log(match);

0 comments on commit d2127dc

Please sign in to comment.