-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add single char lazy loop support to simplified Regex code gen #61698
Conversation
Tagging subscribers to this area: @eerhardt, @dotnet/area-system-text-regularexpressions Issue Details#60385 added support for greedy loops to our "simplified" code gen for the Regex compiler and source generator. This does the same for lazy loops. In our stable of known real-world regex patterns, this increases the number supported by the simplified code gen by ~5% to ~70%. (Note we can subsequently add vectorization into this to, for example, special-case .* and use IndexOf{Any} to search for what comes next.) cc: @joperezr, @danmoseley Code generated for `<.*?>` beforeprotected override void Go()
{
string runtext = base.runtext!;
int runtextbeg = base.runtextbeg;
int runtextend = base.runtextend;
int runtextpos = base.runtextpos;
int[] runtrack = base.runtrack!;
int runtrackpos = base.runtrackpos;
int[] runstack = base.runstack!;
int runstackpos = base.runstackpos;
int tmp1, tmp2, ch;
// 000000 *Lazybranch addr = 13
L0:
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 0;
// 000002 *Setmark
L1:
runstack[--runstackpos] = runtextpos;
runtrack[--runtrackpos] = 1;
// 000003 One '<'
L2:
if (runtextpos >= runtextend || runtext[runtextpos++] != 60)
{
goto Backtrack;
}
// 000005 *Notonelazy '\\n', rep = inf
L3:
tmp1 = runtextend - runtextpos; // count
if (tmp1 <= 0)
{
goto L4;
}
runtrack[--runtrackpos] = tmp1 - 1;
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 2;
// 000008 One '>'
L4:
if (runtextpos >= runtextend || runtext[runtextpos++] != 62)
{
goto Backtrack;
}
// 000010 *Capturemark index = 0
L5:
tmp1 = runstack[runstackpos++];
base.Capture(0, tmp1, runtextpos);
runtrack[--runtrackpos] = tmp1;
runtrack[--runtrackpos] = 3;
// 000013 Stop
L6:
base.runtextpos = runtextpos;
return;
Backtrack:
int limit = base.runtrackcount * 4;
if (runstackpos < limit)
{
base.runstackpos = runstackpos;
base.DoubleStack(); // might change runstackpos and runstack
runstackpos = base.runstackpos;
runstack = base.runstack!;
}
if (runtrackpos < limit)
{
base.runtrackpos = runtrackpos;
base.DoubleTrack(); // might change runtrackpos and runtrack
runtrackpos = base.runtrackpos;
runtrack = base.runtrack!;
}
switch (runtrack[runtrackpos++])
{
case 0:
{
// 000000 *Lazybranch addr = 13
runtextpos = runtrack[runtrackpos++];
goto L6;
}
case 1:
{
// 000002 *Setmark
runstackpos++;
goto Backtrack;
}
case 2:
{
// 000005 *Notonelazy '\\n', rep = inf
runtextpos = runtrack[runtrackpos++];
tmp1 = runtrack[runtrackpos++]; // i
if (runtext[runtextpos++] == '\n')
{
goto Backtrack;
}
if (tmp1 > 0)
{
runtrack[--runtrackpos] = tmp1 - 1;
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 2;
}
goto L4;
}
case 3:
{
// 000010 *Capturemark index = 0
runstack[--runstackpos] = runtrack[runtrackpos++];
base.Uncapture();
goto Backtrack;
}
default:
{
global::System.Diagnostics.Debug.Fail($"Unexpected backtracking state {runtrack[runtrackpos - 1]}");
break;
}
}
} Code generated for `<.*?>` afterprotected override void Go()
{
string runtext = base.runtext!;
int runtextpos = base.runtextpos;
int runtextend = base.runtextend;
int originalruntextpos = runtextpos;
global::System.ReadOnlySpan<byte> byteSpan;
char ch;
global::System.ReadOnlySpan<char> textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
// Concatenate
//{
// One '<'
{
if ((uint)textSpan.Length < 1 || textSpan[0] != '<')
{
goto NoMatch;
}
}
// Notonelazy '\\n'*
//{
runtextpos++;
textSpan = textSpan.Slice(1);
int nextPos0 = runtextpos;
goto endLoop0;
Backtrack1:
runtextpos = nextPos0;
textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
if ((uint)textSpan.Length < 1 || textSpan[0] == '\n')
{
goto NoMatch;
}
runtextpos++;
textSpan = textSpan.Slice(1);
nextPos0 = runtextpos;
endLoop0:
//}
// One '>'
{
if ((uint)textSpan.Length < 1 || textSpan[0] != '>')
{
goto Backtrack1;
}
}
//}
// Match
runtextpos++;
base.runtextpos = runtextpos;
base.Capture(0, originalruntextpos, runtextpos);
return;
// No match
NoMatch:
return;
}
|
77ae85a
to
947a861
Compare
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
Show resolved
Hide resolved
...libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Left a couple of questions, but this looks good.
Random question: is there a reason why we wouldn't want this optimization for multi lazy loops too? E.g. There seems to be a very BIG difference between codegen from onelazy loops and multilazy loops: Code generated for `(?:a)*?b`protected override void Go()
{
string runtext = base.runtext!;
int runtextpos = base.runtextpos;
int runtextend = base.runtextend;
int originalruntextpos = runtextpos;
global::System.ReadOnlySpan<byte> byteSpan;
char ch;
global::System.ReadOnlySpan<char> textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
// Concatenate
//{
// Onelazy 'a'*
//{
int nextPos0 = runtextpos;
goto endLoop0;
Backtrack1:
runtextpos = nextPos0;
textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
if ((uint)textSpan.Length < 1 || textSpan[0] != 'a')
{
goto NoMatch;
}
runtextpos++;
textSpan = textSpan.Slice(1);
nextPos0 = runtextpos;
endLoop0:
//}
// One 'b'
{
if ((uint)textSpan.Length < 1 || textSpan[0] != 'b')
{
goto Backtrack1;
}
}
//}
// Match
runtextpos++;
base.runtextpos = runtextpos;
base.Capture(0, originalruntextpos, runtextpos);
return;
// No match
NoMatch:
return;
} Code generated for `(?:ab)*?b`protected override void Go()
{
string runtext = base.runtext!;
int runtextbeg = base.runtextbeg;
int runtextend = base.runtextend;
int runtextpos = base.runtextpos;
int[] runtrack = base.runtrack!;
int runtrackpos = base.runtrackpos;
int[] runstack = base.runstack!;
int runstackpos = base.runstackpos;
int tmp1, tmp2, ch;
// 000000 *Lazybranch addr = 15
L0:
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 0;
// 000002 *Setmark
L1:
runstack[--runstackpos] = runtextpos;
runtrack[--runtrackpos] = 1;
// 000003 Nullmark
L2:
runstack[--runstackpos] = -1;
runtrack[--runtrackpos] = 1;
// 000004 *Goto addr = 8
L3:
goto L5;
// 000006 Multi "ab"
L4:
if (runtextend - runtextpos < 2 ||
runtext[runtextpos] != 'a' ||
runtext[runtextpos + 1] != 'b')
{
goto Backtrack;
}
runtextpos += 2;
// 000008 *Lazybranchmark addr = 6
L5:
tmp1 = runstack[runstackpos++]; // mark
runtrack[--runtrackpos] = tmp1 != -1 ? tmp1 : runtextpos;
if (runtextpos != tmp1)
{
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 2;
goto L6;
}
runstack[--runstackpos] = tmp1;
runtrack[--runtrackpos] = 3;
// 000010 One 'b'
L6:
if (runtextpos >= runtextend || runtext[runtextpos++] != 98)
{
goto Backtrack;
}
// 000012 *Capturemark index = 0
L7:
tmp1 = runstack[runstackpos++];
base.Capture(0, tmp1, runtextpos);
runtrack[--runtrackpos] = tmp1;
runtrack[--runtrackpos] = 4;
// 000015 Stop
L8:
base.runtextpos = runtextpos;
return;
Backtrack:
int limit = base.runtrackcount * 4;
if (runstackpos < limit)
{
base.runstackpos = runstackpos;
base.DoubleStack(); // might change runstackpos and runstack
runstackpos = base.runstackpos;
runstack = base.runstack!;
}
if (runtrackpos < limit)
{
base.runtrackpos = runtrackpos;
base.DoubleTrack(); // might change runtrackpos and runtrack
runtrackpos = base.runtrackpos;
runtrack = base.runtrack!;
}
switch (runtrack[runtrackpos++])
{
case 0:
{
// 000000 *Lazybranch addr = 15
runtextpos = runtrack[runtrackpos++];
goto L8;
}
case 1:
{
// 000002 *Setmark
runstackpos++;
goto Backtrack;
}
case 2:
{
// 000008 *Lazybranchmark addr = 6
runtextpos = runtrack[runtrackpos++];
runstack[--runstackpos] = runtextpos;
runtrack[--runtrackpos] = 3;
if (runtrackpos <= 20 || runstackpos <= 15)
{
runtrack[--runtrackpos] = 5;
goto Backtrack;
}
goto L4;
}
case 3:
{
// 000008 *Lazybranchmark addr = 6
runstack[runstackpos] = runtrack[runtrackpos++];
goto Backtrack;
}
case 4:
{
// 000012 *Capturemark index = 0
runstack[--runstackpos] = runtrack[runtrackpos++];
base.Uncapture();
goto Backtrack;
}
case 5:
{
goto L4;
}
default:
{
global::System.Diagnostics.Debug.Fail($"Unexpected backtracking state {runtrack[runtrackpos - 1]}");
break;
}
}
} Of course I'm not suggesting for that support to also be added here if this is something we do want to add, but just asking in case we want to log an issue for adding that later. |
"This optimization" being the simplified code gen path? Eventually it'd be nice if we could switch to this approach for everything. I've been building it up piecemeal, focusing on which constructs would cast the widest net. In .NET 5, we covered ~40% of real-world patterns (based on our data set). From previous PRs in .NET 7, we've gotten that up to ~65%. This PR gets us to ~70%. The construct you're talking about shows up as a RegexNode.Lazy, which wraps an arbitrary node, rather than a RegexNode.{One/Notone/Set}lazy, which is a specialized, very popular (in particular sets) single character lazy loop. This PR adds support for the latter; we'd need to do something different for the former. The code to handle a RegexNode.Loop (greedy) would be doable but I expect comparatively non-trivial and involve a true stack, as we'd need to track all the places each iteration completed in order to backtrack there... but for RegexNode.Lazy, it's probably a lot simpler and much closer to this PR's code, since we're incrementally matching another iteration and thus don't need to track an arbitrary number of locations. If you're interested in dipping your toes in, might be a good one to attempt (according to our "real-world" data set, it would net us less than 1% additional patterns, but that doesn't say anything about how "valuable" those patterns are). |
947a861
to
b051c2c
Compare
Actually, looking at it further, I think it's just a few lines needing to be tweaked on top of what this PR provided... trying... |
#60385 added support for greedy loops to our "simplified" code gen for the Regex compiler and source generator. This does the same for lazy loops. In our stable of known real-world regex patterns, this increases the number supported by the simplified code gen by ~5% to ~70%.
(Note we can subsequently add vectorization into this to, for example, special-case .* and use IndexOf{Any} to search for what comes next.)
cc: @joperezr, @danmoseley
Code generated for `<.*?>` before
Code generated for `<.*?>` after