-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add limited support for backtracking Regex single char loops to simplified code gen #60385
Conversation
Tagging subscribers to this area: @eerhardt, @dotnet/area-system-text-regularexpressions Issue DetailsIn .NET 5, we added simpler compiled code gen for regexes that didn't entail backtracking (or that had only very constrained backtracking, such as a top-level alternation). In our corpus of ~90K regular expressions, that code generator is employed for ~40% of them. The primary purpose of adding that code generator initially was performance, as it was able to avoid lots of the expense that original code generator had, especially for simple regexes. However, with the source generator, it's much more valuable to use this code gen as the generated code is human-readable and really helps to understand how the regex is operating, is much more easily debugged, etc. This change allows the simplified code gen to be used even if there are backtracking single-character loops in the regex, as long as those loops are in a top-level concatenation (or a simple grouping structure like a capture). This increases the percentage of expressions in our corpus that will use the simplified code gen to ~65%. Once we have the simplified loop code gen, it's also a lot easier to add in vectorization of searching for the next location to back off to based on a literal that comes immediately after the loop (e.g. "abc.*def"). This adds support into both RegexOptions.Compiled and the source generator to use LastIndexOf in that case. The change also entailed adding/updating a few recursive functions. The plan has been to adopt the same model as in System.Linq.Expressions, Roslyn, and elsewhere, where we fork processing to continue on a secondary thread, rather than trying to enforce some max depth or rewrite as iterative, so I've done that as part of this change as well. As an example, the "email" benchmark from: private readonly static Regex s_email = new Regex(@"[\w\.+-]+@[\w\.-]+\.[\w\.-]+", RegexOptions.Compiled);
[Benchmark]
public int Email() => Count(s_email, s_mariomkaInput);
private static int Count(Regex r, string input)
{
int count = 0;
Match m = r.Match(input);
while (m.Success)
{
count++;
m = m.NextMatch();
}
return count;
}
And here's the generated code for the matching logic before and after... Beforeprotected override void Go()
{
string runtext = base.runtext!;
int runtextbeg = base.runtextbeg;
int runtextend = base.runtextend;
int runtextpos = base.runtextpos;
int[] runtrack = base.runtrack!;
int runtrackpos = base.runtrackpos;
int[] runstack = base.runstack!;
int runstackpos = base.runstackpos;
int tmp1, tmp2, ch;
// 000000 *Lazybranch addr = 29
L0:
runtrack[--runtrackpos] = runtextpos;
runtrack[--runtrackpos] = 0;
// 000002 *Setmark
L1:
runstack[--runstackpos] = runtextpos;
runtrack[--runtrackpos] = 1;
// 000003 Setrep [+-.\\w], rep = 1
L2:
if (runtextend - runtextpos < 1)
{
goto Backtrack;
}
for (int i = 0; i < 1; i++)
{
if (!((ch = runtext[runtextpos + i]) < 128 ? ("\0\0栀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0004\n+,-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
goto Backtrack;
}
}
runtextpos++;
// 000006 Setloopatomic [+-.\\w], rep = inf
L3:
tmp1 = runtextend - runtextpos; // length
tmp2 = tmp1 + 1;
while (--tmp2 > 0)
{
if (!((ch = runtext[runtextpos++]) < 128 ? ("\0\0栀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0004\n+,-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
runtextpos--;
break;
}
}
// 000009 UpdateBumpalong
L4:
runtrack[^1] = runtextpos;
// 000010 One '@'
L5:
if (runtextpos >= runtextend || runtext[runtextpos++] != 64)
{
goto Backtrack;
}
// 000012 Setrep [-.\\w], rep = 1
L6:
if (runtextend - runtextpos < 1)
{
goto Backtrack;
}
for (int i = 0; i < 1; i++)
{
if (!((ch = runtext[runtextpos + i]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
goto Backtrack;
}
}
runtextpos++;
// 000015 *Setloop [-.\\w], rep = inf
L7:
tmp1 = runtextend - runtextpos; // length
tmp2 = tmp1 + 1;
while (--tmp2 > 0)
{
if (!((ch = runtext[runtextpos++]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
runtextpos--;
break;
}
}
if (tmp2 >= tmp1)
{
goto L8;
}
runtrack[--runtrackpos] = tmp1 - tmp2 - 1;
runtrack[--runtrackpos] = runtextpos - 1;
runtrack[--runtrackpos] = 2;
// 000018 One '.'
L8:
if (runtextpos >= runtextend || runtext[runtextpos++] != 46)
{
goto Backtrack;
}
// 000020 Setrep [-.\\w], rep = 1
L9:
if (runtextend - runtextpos < 1)
{
goto Backtrack;
}
for (int i = 0; i < 1; i++)
{
if (!((ch = runtext[runtextpos + i]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
goto Backtrack;
}
}
runtextpos++;
// 000023 Setloopatomic [-.\\w], rep = inf
L10:
tmp1 = runtextend - runtextpos; // length
tmp2 = tmp1 + 1;
while (--tmp2 > 0)
{
if (!((ch = runtext[runtextpos++]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
runtextpos--;
break;
}
}
// 000026 *Capturemark index = 0
L11:
tmp1 = runstack[runstackpos++];
base.Capture(0, tmp1, runtextpos);
runtrack[--runtrackpos] = tmp1;
runtrack[--runtrackpos] = 3;
// 000029 Stop
L12:
base.runtextpos = runtextpos;
return;
Backtrack:
int limit = base.runtrackcount * 4;
if (runstackpos < limit)
{
base.runstackpos = runstackpos;
base.DoubleStack(); // might change runstackpos and runstack
runstackpos = base.runstackpos;
runstack = base.runstack!;
}
if (runtrackpos < limit)
{
base.runtrackpos = runtrackpos;
base.DoubleTrack(); // might change runtrackpos and runtrack
runtrackpos = base.runtrackpos;
runtrack = base.runtrack!;
}
switch (runtrack[runtrackpos++])
{
case 0:
{
// 000000 *Lazybranch addr = 29
runtextpos = runtrack[runtrackpos++];
goto L12;
}
case 1:
{
// 000002 *Setmark
runstackpos++;
goto Backtrack;
}
case 2:
{
// 000015 *Setloop [-.\\w], rep = inf
runtextpos = runtrack[runtrackpos++];
tmp1 = runtrack[runtrackpos++]; // position
if (tmp1 > 0)
{
runtrack[--runtrackpos] = tmp1 - 1;
runtrack[--runtrackpos] = runtextpos - 1;
runtrack[--runtrackpos] = 2;
}
goto L8;
}
case 3:
{
// 000026 *Capturemark index = 0
runstack[--runstackpos] = runtrack[runtrackpos++];
base.Uncapture();
goto Backtrack;
}
default:
{
global::System.Diagnostics.Debug.Fail($"Unexpected backtracking state {runtrack[runtrackpos - 1]}");
break;
}
}
} Afterprotected override void Go()
{
string runtext = base.runtext!;
int runtextpos = base.runtextpos;
int runtextend = base.runtextend;
int originalruntextpos = runtextpos;
global::System.ReadOnlySpan<byte> byteSpan;
char ch;
global::System.ReadOnlySpan<char> textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
// Concatenate
//{
// Setloopatomic [+-.\\w]+
{
int i0 = 0;
while ((uint)i0 < (uint)textSpan.Length && ((ch = textSpan[i0]) < 128 ? ("\0\0栀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0004\n+,-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
i0++;
}
if (i0 < 1)
{
goto NoMatch;
}
textSpan = textSpan.Slice(i0);
runtextpos += i0;
}
// UpdateBumpalong
{
base.runtextpos = runtextpos;
}
// One '@'
{
if ((uint)textSpan.Length < 1 || textSpan[0] != '@')
{
goto NoMatch;
}
}
// Setloop [-.\\w]+
//{
runtextpos++;
textSpan = textSpan.Slice(1);
int startingRunTextPos1 = runtextpos;
int i4 = 0;
while ((uint)i4 < (uint)textSpan.Length && ((ch = textSpan[i4]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
i4++;
}
if (i4 < 1)
{
goto NoMatch;
}
textSpan = textSpan.Slice(i4);
runtextpos += i4;
int endingRunTextPos2 = runtextpos;
int crawlPos3 = base.Crawlpos();
startingRunTextPos1 += 1;
goto EndLoop1;
Backtrack0:
if (startingRunTextPos1 >= endingRunTextPos2)
{
goto NoMatch;
}
endingRunTextPos2 = runtext.LastIndexOf('.', endingRunTextPos2 - 1, endingRunTextPos2 - startingRunTextPos1);
if (endingRunTextPos2 < 0)
{
goto NoMatch;
}
runtextpos = endingRunTextPos2;
textSpan = global::System.MemoryExtensions.AsSpan(runtext, runtextpos, runtextend - runtextpos);
EndLoop1:
//}
// One '.'
{
if ((uint)textSpan.Length < 1 || textSpan[0] != '.')
{
goto Backtrack0;
}
}
// Setloopatomic [-.\\w]+
{
runtextpos++;
textSpan = textSpan.Slice(1);
int i5 = 0;
while ((uint)i5 < (uint)textSpan.Length && ((ch = textSpan[i5]) < 128 ? ("\0\0怀Ͽ\ufffe蟿\ufffe\u07ff"[ch >> 4] & (1 << (ch & 0xF))) != 0 : CharInClass((char)ch, "\0\u0002\n-/\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0")))
{
i5++;
}
if (i5 < 1)
{
goto Backtrack0;
}
textSpan = textSpan.Slice(i5);
runtextpos += i5;
}
//}
// Match
base.runtextpos = runtextpos;
base.Capture(0, originalruntextpos, runtextpos);
return;
// No match
NoMatch:
return;
}
|
Just curious, what does the generated code look like for this example? could you share? |
It's already in the PR description. Expand the before/ after nodes. |
It's surprisingly readable! |
😯 Oh ye of little faith 😄 |
253d091
to
14eed58
Compare
There's a joke here somewhere about the readability of goto statements (that is still respectful), but I don't know what it is. 😼 |
@BrzVlad, any idea why the |
@stephentoub Should get fixed by #60514 |
Thanks |
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
85a8031
to
7dfe534
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a few questions - mostly for my learning.
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs
Show resolved
Hide resolved
...gularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
Show resolved
Hide resolved
dd72b32
to
124ddcf
Compare
@safern, some drawing tests have repeatedly failed on this PR. It's not clear to me how my changes here could have broken this, but it's failed multiple times.
Is there any known issue here? I don't see any open issues for it. |
Ah, I see #60731 was literally just created. |
Hmm interesting, I'll dig into what cause them to start failing and disable them. Thanks for the ping. |
…ified code gen In .NET 5, we added simpler compiled code gen for regexes that didn't entail backtracking (or that had only very constrained backtracking, such as a top-level alternation). In our corpus of ~90K regular expressions, that code generator is employed for ~40% of them. The primary purpose of adding that code generator initially was performance, as it was able to avoid lots of the expense that original code generator had, especially for simple regexes. However, with the source generator, it's much more valuable to use this code gen as the generated code is human-readable and really helps to understand how the regex is operating, is much more easily debugged, etc. This change allows the simplified code gen to be used even if there are backtracking single-character loops in the regex, as long as those loops are in a top-level concatenation (or a simple grouping structure like a capture). This increases the percentage of expressions in our corpus that will use the simplified code gen to ~65%. Once we have the simplified loop code gen, it's also a lot easier to add in vectorization of searching for the next location to back off to based on a literal that comes immediately after the loop (e.g. "abc.*def"). This adds support into both RegexOptions.Compiled and the source generator to use LastIndexOf in that case. The change also entailed adding/updating a few recursive functions. The plan has been to adopt the same model as in System.Linq.Expressions, Roslyn, and elsewhere, where we fork processing to continue on a secondary thread, rather than trying to enforce some max depth or rewrite as iterative, so I've done that as part of this change as well.
124ddcf
to
e8bb072
Compare
Linux/x64 improvement - dotnet/perf-autofiling-issues#1975 |
Excellent. |
In .NET 5, we added simpler compiled code gen for regexes that didn't entail backtracking (or that had only very constrained backtracking, such as a top-level alternation). In our corpus of ~90K regular expressions, that code generator is employed for ~40% of them. The primary purpose of adding that code generator initially was performance, as it was able to avoid lots of the expense that original code generator had, especially for simple regexes. However, with the source generator, it's much more valuable to use this code gen as the generated code is human-readable and really helps to understand how the regex is operating, is much more easily debugged, etc.
This change allows the simplified code gen to be used even if there are backtracking single-character loops in the regex, as long as those loops are in a top-level concatenation (or a simple grouping structure like a capture). This increases the percentage of expressions in our corpus that will use the simplified code gen to ~65%.
Once we have the simplified loop code gen, it's also a lot easier to add in vectorization of searching for the next location to back off to based on a literal that comes immediately after the loop (e.g. "abc.*def"). This adds support into both RegexOptions.Compiled and the source generator to use LastIndexOf in that case.
The change also entailed adding/updating a few recursive functions. The plan has been to adopt the same model as in System.Linq.Expressions, Roslyn, and elsewhere, where we fork processing to continue on a secondary thread, rather than trying to enforce some max depth or rewrite as iterative, so I've done that as part of this change as well.
As an example, the "email" benchmark from:
https://github.com/mariomka/regex-benchmark/blame/244ca6c0e4bc8dd257904c51c0b5cabba6956dd2/csharp/Benchmark.cs#L19-L20
And here's the generated code for the matching logic before and after...
Before
After