Skip to content

Commit 0c38d80

Browse files
ievievstephentoubdanmoseley
authored
NonBacktracking Regex optimizations (#102655)
- Raise limits for when to switch from DFA to NFA and for when to fail to build because the predicted NFA size is too large to handle. - Cache nullability information for each minterm to make computing it cheaper. - Cache minterm information for all characters that map to a minterm other than zero, avoiding expensive computation per step for each non-ASCII character encountered. - Reduce inner hot loop overhead for patterns not containing an end Z anchor and for having fewer than 256 minterms (more than that is rare). - Reduce the frequency of timeout checks that were both costly and unnecessarily frequent to achieve the goal --------- Co-authored-by: ieviev <ieviev@users.noreply.github.com> Co-authored-by: Stephen Toub <stoub@microsoft.com> Co-authored-by: Dan Moseley <danmose@microsoft.com>
1 parent ebaa0cd commit 0c38d80

18 files changed

+994
-257
lines changed

src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj

+2
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@
7272
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchingState.cs" />
7373
<Compile Include="System\Text\RegularExpressions\Symbolic\DoublyLinkedList.cs" />
7474
<Compile Include="System\Text\RegularExpressions\Symbolic\ISolver.cs" />
75+
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversalKind.cs"/>
76+
<Compile Include="System\Text\RegularExpressions\Symbolic\MatchReversal.cs"/>
7577
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermClassifier.cs" />
7678
<Compile Include="System\Text\RegularExpressions\Symbolic\MintermGenerator.cs" />
7779
<Compile Include="System\Text\RegularExpressions\Symbolic\RegexNodeConverter.cs" />

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
using System.Collections.Generic;
66
using System.Diagnostics;
77
using System.Runtime.CompilerServices;
8-
using System.Runtime.InteropServices;
98

109
#pragma warning disable CS8500 // takes address of managed type
1110

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/BitVectorSolver.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ internal sealed class BitVectorSolver : ISolver<BitVector>
1010
internal readonly MintermClassifier _classifier;
1111
private readonly BitVector[] _mintermVectors;
1212

13-
public BitVectorSolver(BDD[] minterms, CharSetSolver solver)
13+
public BitVectorSolver(BDD[] minterms)
1414
{
1515
_minterms = minterms;
1616

17-
_classifier = new MintermClassifier(minterms, solver);
17+
_classifier = new MintermClassifier(minterms);
1818

1919
var singleBitVectors = new BitVector[minterms.Length];
2020
for (int i = 0; i < singleBitVectors.Length; i++)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System.Diagnostics;
5+
6+
namespace System.Text.RegularExpressions.Symbolic
7+
{
8+
/// <summary>Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.</summary>
9+
internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
10+
{
11+
/// <summary>Initializes the match reversal details.</summary>
12+
internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
13+
{
14+
Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
15+
Debug.Assert(fixedLength >= 0);
16+
Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));
17+
18+
Kind = kind;
19+
FixedLength = fixedLength;
20+
AdjustedStartState = adjustedStartState;
21+
}
22+
23+
/// <summary>Gets the kind of the match reversal processing required.</summary>
24+
internal MatchReversalKind Kind { get; }
25+
26+
/// <summary>Gets the fixed length of the match, if one is known.</summary>
27+
/// <remarks>
28+
/// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
29+
/// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
30+
/// by subtracting this length from the end.
31+
/// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of the fixed portion of the match.
32+
/// </remarks>
33+
internal int FixedLength { get; }
34+
35+
/// <summary>Gets the adjusted start state to use for partial fixed-length matches.</summary>
36+
/// <remarks>This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
37+
internal MatchingState<TSet>? AdjustedStartState { get; }
38+
}
39+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
namespace System.Text.RegularExpressions.Symbolic
5+
{
6+
/// <summary>Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.</summary>
7+
internal enum MatchReversalKind
8+
{
9+
/// <summary>The regex should be run in reverse to find the beginning of the match.</summary>
10+
MatchStart,
11+
12+
/// <summary>The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.</summary>
13+
/// <remarks>
14+
/// Reverse execution is not necessary for a subset of the match.
15+
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
16+
/// </remarks>
17+
PartialFixedLength,
18+
19+
/// <summary>The entire pattern is of a fixed length.</summary>
20+
/// <remarks>
21+
/// Reverse execution is not necessary to find the beginning of the match.
22+
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
23+
/// </remarks>
24+
FixedLength
25+
}
26+
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs

+38-10
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
1414
{
1515
Node = node;
1616
PrevCharKind = prevCharKind;
17+
NullabilityInfo = BuildNullabilityInfo();
1718
}
1819

1920
/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
@@ -95,21 +96,37 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
9596
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
9697
}
9798

99+
/// <summary>Determines whether the node is nullable for the given context.</summary>
100+
/// <remarks>
101+
/// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
102+
/// answers stored in <see cref="NullabilityInfo"/>.
103+
/// </remarks>
98104
[MethodImpl(MethodImplOptions.AggressiveInlining)]
99105
internal bool IsNullableFor(uint nextCharKind)
100106
{
101-
Debug.Assert(CharKind.IsValidCharKind(nextCharKind));
102-
uint context = CharKind.Context(PrevCharKind, nextCharKind);
103-
return Node.IsNullableFor(context);
107+
Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
108+
return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
104109
}
105110

111+
/// <summary>Gets the nullability info for the matching state.</summary>
112+
/// <remarks>
113+
/// <list>
114+
/// <item>00000 -> node cannot be nullable</item>
115+
/// <item>00001 -> nullable for General</item>
116+
/// <item>00010 -> nullable for BeginningEnd</item>
117+
/// <item>00100 -> nullable for NewLine</item>
118+
/// <item>01000 -> nullable for NewLineS</item>
119+
/// <item>10000 -> nullable for WordLetter</item>
120+
/// </list>
121+
/// </remarks>
122+
internal int NullabilityInfo { get; }
123+
106124
/// <summary>
107125
/// Builds a <see cref="StateFlags"/> with the relevant flags set.
108126
/// </summary>
109-
/// <param name="solver">a solver for <typeparamref name="TSet"/></param>
110127
/// <param name="isInitial">whether this state is an initial state</param>
111128
/// <returns>the flags for this matching state</returns>
112-
internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
129+
internal StateFlags BuildStateFlags(bool isInitial)
113130
{
114131
StateFlags info = 0;
115132

@@ -118,11 +135,6 @@ internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
118135
info |= StateFlags.IsInitialFlag;
119136
}
120137

121-
if (IsDeadend(solver))
122-
{
123-
info |= StateFlags.IsDeadendFlag;
124-
}
125-
126138
if (Node.CanBeNullable)
127139
{
128140
info |= StateFlags.CanBeNullableFlag;
@@ -140,6 +152,22 @@ internal StateFlags BuildStateFlags(ISolver<TSet> solver, bool isInitial)
140152
return info;
141153
}
142154

155+
/// <summary>Builds the nullability information for the matching state.</summary>
156+
/// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
157+
private byte BuildNullabilityInfo()
158+
{
159+
byte nullabilityInfo = 0;
160+
if (Node.CanBeNullable)
161+
{
162+
for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
163+
{
164+
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
165+
}
166+
}
167+
168+
return nullabilityInfo;
169+
}
170+
143171
public override bool Equals(object? obj) =>
144172
obj is MatchingState<TSet> s && PrevCharKind == s.PrevCharKind && Node.Equals(s.Node);
145173

Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers;
45
using System.Diagnostics;
6+
using System.Numerics;
57
using System.Runtime.CompilerServices;
68

79
namespace System.Text.RegularExpressions.Symbolic
@@ -20,81 +22,104 @@ namespace System.Text.RegularExpressions.Symbolic
2022
/// </remarks>
2123
internal sealed class MintermClassifier
2224
{
23-
/// <summary>An array used when there's a single minterm, in order to map every ASCII character to it trivially.</summary>
24-
private static readonly int[] AllAsciiIsZeroMintermArray = new int[128];
25+
/// <summary>Mapping for characters to minterms, used in the vast majority of cases, when there are fewer than 256 minterms.</summary>
26+
/// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
27+
private readonly byte[]? _lookup;
2528

26-
/// <summary>Array providing fast mapping from an ASCII character (the array index) to its corresponding minterm ID.</summary>
27-
private readonly int[] _ascii;
28-
/// <summary>A multi-terminal BDD for mapping any non-ASCII character to its associated minterm ID.</summary>
29-
/// <remarks>
30-
/// The use of a multi-terminal BDD here is an implementation detail. Should we decide its important to optimize non-ASCII inputs further,
31-
/// or to consolidate the mechanism with the other engines, an alternatie lookup algorithm / data structure could be employed.
32-
/// </remarks>
33-
private readonly BDD _nonAscii;
29+
/// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
30+
/// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
31+
private readonly int[]? _intLookup;
3432

3533
/// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
3634
/// <param name="minterms">A BDD for classifying all characters (ASCII and non-ASCII) to their corresponding minterm IDs.</param>
37-
/// <param name="solver">The character set solver to use.</param>
38-
public MintermClassifier(BDD[] minterms, CharSetSolver solver)
35+
public MintermClassifier(BDD[] minterms)
3936
{
4037
Debug.Assert(minterms.Length > 0, "Requires at least");
4138

4239
if (minterms.Length == 1)
4340
{
4441
// With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
45-
// For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0.
46-
_ascii = AllAsciiIsZeroMintermArray;
47-
_nonAscii = solver.ReplaceTrue(BDD.True, 0);
42+
_lookup = [];
4843
return;
4944
}
5045

51-
// Create a multi-terminal BDD for mapping any character to its associated minterm.
52-
BDD anyCharacterToMintermId = BDD.False;
53-
for (int i = 0; i < minterms.Length; i++)
54-
{
55-
// Each supplied minterm BDD decides whether a given character maps to it or not.
56-
// We need to combine all of those into a multi-terminal BDD that decides which
57-
// minterm a character maps to. To do that, we take each minterm BDD and replace
58-
// its True result with the ID of the minterm, such that a character that would
59-
// have returned True for that BDD now returns the minterm ID.
60-
BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);
46+
// Compute all minterm ranges. We do this here in order to determine the maximum character value
47+
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
48+
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
49+
// consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case.
50+
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
51+
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
6152

62-
// Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
63-
// is valid because every character belongs to exactly one minterm and thus will
64-
// only map to an ID instead of False in exactly one of the input BDDs.
65-
anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
53+
int maxChar = -1;
54+
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
55+
{
56+
(uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
57+
charRangesPerMinterm[mintermId] = ranges;
58+
maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
6659
}
6760

68-
// Now that we have our mapping that supports any input character, we want to optimize for
69-
// ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match
70-
// time, we precompute a lookup table, where each ASCII character can be used to index into the
71-
// array to determine the ID for its corresponding minterm.
72-
var ascii = new int[128];
73-
for (int i = 0; i < ascii.Length; i++)
61+
// It's incredibly rare for a regex to use more than a couple hundred minterms,
62+
// but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
63+
if (minterms.Length > 255)
64+
{
65+
_intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
66+
}
67+
else
7468
{
75-
ascii[i] = anyCharacterToMintermId.Find(i);
69+
_lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
7670
}
77-
_ascii = ascii;
7871

79-
// We can also further optimize the BDD in two ways:
80-
// 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
81-
// for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not
82-
// affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
83-
// 2. We can check if every character now maps to the same minterm ID (the same terminal in the
84-
// multi-terminal BDD). This can be relatively common after (1) above is applied, as many
85-
// patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character
86-
// in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
87-
BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver.NonAscii);
88-
nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
89-
_nonAscii = nonAsciiBDD;
72+
// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
73+
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
74+
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
75+
76+
// Creates the lookup array.
77+
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
78+
{
79+
T[] lookup = new T[_maxChar + 1];
80+
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
81+
{
82+
// Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
83+
foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
84+
{
85+
lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
86+
}
87+
}
88+
89+
return lookup;
90+
}
9091
}
9192

92-
/// <summary>Gets the ID of the minterm associated with the specified character.</summary>
93+
/// <summary>Gets the ID of the minterm associated with the specified character. </summary>
9394
[MethodImpl(MethodImplOptions.AggressiveInlining)]
9495
public int GetMintermID(int c)
9596
{
96-
int[] ascii = _ascii;
97-
return (uint)c < (uint)ascii.Length ? ascii[c] : _nonAscii.Find(c);
97+
if (_lookup is not null)
98+
{
99+
byte[] lookup = _lookup;
100+
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
101+
}
102+
else
103+
{
104+
int[] lookup = _intLookup!;
105+
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
106+
}
98107
}
108+
/// <summary>
109+
/// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
110+
/// Null if there are greater than 255 minterms.
111+
/// </summary>
112+
public byte[]? ByteLookup => _lookup;
113+
114+
/// <summary>
115+
/// Gets a mapping from char to minterm for the rare case when there are more than 255 minterms.
116+
/// Null in the common case where there are 255 or fewer minterms.
117+
/// </summary>
118+
public int[]? IntLookup => _intLookup;
119+
120+
/// <summary>
121+
/// Gets the maximum ordinal character that maps to a non-0 minterm; the lookup array is sized to it to conserve memory, and any character above it maps to minterm 0.
122+
/// </summary>
123+
public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
99124
}
100125
}

0 commit comments

Comments
 (0)