Skip to content

Commit b040ed6

Browse files
stephentoub and MihaZupan authored Apr 3, 2024
Several improvements / simplifications in Regex (#100315)
* Several improvements / simplifications in Regex This started out as a small improvement for one thing and grew to be something else. Initially, my intent was just to improve how `SearchValues<char>` applies to character classes with subtraction. Character class subtraction isn't frequently used, but it is a convenient way to express removing subsets of ranges, e.g. all ASCII other than digits `[\u0000-\u007F-[0-9]]`. Currently when we go to enumerate the characters in a char class, for perf reasons we only do the enumeration if we can enumerate sets and up to the max space provided, in order to keep the time down. We immediately give up if the char class has subtraction, but given that we've already limited how many values we're enumerating, if there is subtraction we can afford to query for just those chars that would otherwise pass in order to enable the subtraction. So, with this PR, we can now support using SearchValues in this manner: **this means that whereas previously we would have generated an IndexOfAny for any of the ASCII characters or anything non-ASCII, then with a fallback for if we hit something non-ASCII, now we'll just create an IndexOfAny for the full set**. However, that triggered a (then defunct) assert which led me to see that we have a bunch of duplicated logic around asserts: we'd frequently be checking to see if a set contained at most 5 chars (in support of a time when we didn't have SearchValues and only optimized IndexOfAny for up to 5 chars) and then subsequently would see if it contained only ASCII. We no longer need that separation, especially since SearchValues will now both vectorize probabilistic map searches and will first do a search for the ASCII portion (or anything non-ASCII). 
**This then means we can delete a variety of duplicated code while also expanding what we recognize for use with SearchValues.** This then led to seeing that in a variety of places we compute the set of chars in a set and then check whether it could instead be satisfied just as a range, but not if the set of chars is small. The former check is more expensive than the latter, but we were doing the first one first presumably in order to be able to do the set size check as part of the latter. However, we don't need it for that, as a single subtraction gives us the size of the range, **so we can just do the range check first and skip the more expensive set check if it's not needed.** That then led to seeing that we're not using range-based searching in the interpreter or non-backtracking engines. **This adds that support, such that the interpreter/non-backtracking engines will now search for the next starting location using IndexOfAny{Except}InRange if appropriate.** * Update src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs Co-authored-by: Miha Zupan <mihazupan.zupan1@gmail.com> --------- Co-authored-by: Miha Zupan <mihazupan.zupan1@gmail.com>
1 parent dd1b8b5 commit b040ed6

File tree

7 files changed

+87
-158
lines changed

7 files changed

+87
-158
lines changed
 

‎src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

+15-37
Original file line numberDiff line numberDiff line change
@@ -391,11 +391,13 @@ private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> require
391391
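/// <summary>Emits either a SearchValues instance (adding its declaration to the required helpers collection) or a string literal for the chars, using SearchValues when there are more than 5 chars or they are all ASCII.</summary>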
392392
private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictionary<string, string[]> requiredHelpers)
393393
{
394-
// SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
395-
// Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
396-
Debug.Assert(chars.Length is 4 or 5);
394+
Debug.Assert(chars.Length > 3);
397395

398-
return RegexCharClass.IsAscii(chars)
396+
// IndexOfAny(SearchValues) is faster than a regular IndexOfAny("abcd") if:
397+
// - There are more than 5 characters in the needle, or
398+
// - There are only 4 or 5 characters in the needle and they're all ASCII.
399+
400+
return chars.Length > 5 || RegexCharClass.IsAscii(chars)
399401
? EmitSearchValues(chars.ToArray(), requiredHelpers)
400402
: Literal(chars.ToString());
401403
}
@@ -3510,11 +3512,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
35103512
{
35113513
if (iterationCount is null &&
35123514
node.Kind is RegexNodeKind.Notonelazy &&
3513-
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
3515+
subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
35143516
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
35153517
(literal.String is not null ||
35163518
literal.SetChars is not null ||
3517-
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
35183519
literal.Range.LowInclusive == literal.Range.HighInclusive ||
35193520
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
35203521
{
@@ -3546,18 +3547,6 @@ literal.SetChars is not null ||
35463547
(false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValuesOrLiteral($"{node.Ch}{literal.SetChars}".AsSpan(), requiredHelpers)});",
35473548
});
35483549
}
3549-
else if (literal.AsciiChars is not null) // set of only ASCII characters
3550-
{
3551-
char[] asciiChars = literal.AsciiChars;
3552-
overlap = asciiChars.Contains(node.Ch);
3553-
if (!overlap)
3554-
{
3555-
Debug.Assert(node.Ch < 128);
3556-
Array.Resize(ref asciiChars, asciiChars.Length + 1);
3557-
asciiChars[asciiChars.Length - 1] = node.Ch;
3558-
}
3559-
writer.WriteLine($"{startingPos} = {sliceSpan}.IndexOfAny({EmitSearchValues(asciiChars, requiredHelpers)});");
3560-
}
35613550
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
35623551
{
35633552
overlap = literal.Range.LowInclusive == node.Ch;
@@ -4928,11 +4917,10 @@ private static bool TryEmitIndexOf(
49284917
{
49294918
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
49304919

4931-
Span<char> setChars = stackalloc char[5]; // current max that's vectorized
4932-
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
4933-
4934-
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
4935-
if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
4920+
// IndexOfAny{Except}InRange
4921+
// Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
4922+
if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
4923+
(highInclusive - lowInclusive) > 1)
49364924
{
49374925
string indexOfAnyInRangeName = !negated ?
49384926
"IndexOfAnyInRange" :
@@ -4944,13 +4932,15 @@ private static bool TryEmitIndexOf(
49444932
return true;
49454933
}
49464934

4947-
if (setCharsCount > 0)
4935+
// IndexOfAny{Except}(ch1, ...)
4936+
Span<char> setChars = stackalloc char[128];
4937+
setChars = setChars.Slice(0, RegexCharClass.GetSetChars(node.Str, setChars));
4938+
if (!setChars.IsEmpty)
49484939
{
49494940
(string indexOfName, string indexOfAnyName) = !negated ?
49504941
("IndexOf", "IndexOfAny") :
49514942
("IndexOfAnyExcept", "IndexOfAnyExcept");
49524943

4953-
setChars = setChars.Slice(0, setCharsCount);
49544944
indexOfExpr = setChars.Length switch
49554945
{
49564946
1 => $"{last}{indexOfName}({Literal(setChars[0])})",
@@ -4962,18 +4952,6 @@ private static bool TryEmitIndexOf(
49624952
literalLength = 1;
49634953
return true;
49644954
}
4965-
4966-
if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
4967-
{
4968-
string indexOfAnyName = !negated ?
4969-
"IndexOfAny" :
4970-
"IndexOfAnyExcept";
4971-
4972-
indexOfExpr = $"{last}{indexOfAnyName}({EmitSearchValues(asciiChars, requiredHelpers)})";
4973-
4974-
literalLength = 1;
4975-
return true;
4976-
}
49774955
}
49784956

49794957
indexOfExpr = null;

‎src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<EnableDefaultEmbeddedResourceItems>false</EnableDefaultEmbeddedResourceItems>
1212
<UsingToolXliff>true</UsingToolXliff>
1313
<CLSCompliant>false</CLSCompliant>
14-
<NoWarn>$(NoWarn);CS0436;CS0649</NoWarn>
14+
<NoWarn>$(NoWarn);CS0436;CS0649;CA1872</NoWarn>
1515
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
1616
<AnalyzerLanguage>cs</AnalyzerLanguage>
1717
</PropertyGroup>

‎src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

+27-24
Original file line numberDiff line numberDiff line change
@@ -815,17 +815,23 @@ public static bool TryGetDoubleRange(
815815
/// If 0 is returned, no assumptions can be made about the characters.
816816
/// </returns>
817817
/// <remarks>
818-
/// Only considers character classes that only contain sets (no categories)
819-
/// and no subtraction... just simple sets containing starting/ending pairs.
820-
/// The returned characters may be negated: if IsNegated(set) is false, then
821-
/// the returned characters are the only ones that match; if it returns true,
822-
/// then the returned characters are the only ones that don't match.
818+
/// Only considers character classes that only contain sets (no categories),
819+
/// just simple sets containing starting/ending pairs (subtraction from those pairs
820+
/// is factored in, however). The returned characters may be negated: if IsNegated(set)
821+
/// is false, then the returned characters are the only ones that match; if it returns
822+
/// true, then the returned characters are the only ones that don't match.
823823
/// </remarks>
824824
public static int GetSetChars(string set, Span<char> chars)
825825
{
826826
// We get the characters by enumerating the set portion, so we validate that it's
827827
// set up to enable that, e.g. no categories.
828-
if (!CanEasilyEnumerateSetContents(set))
828+
if (!CanEasilyEnumerateSetContents(set, out bool hasSubtraction))
829+
{
830+
return 0;
831+
}
832+
833+
// Negation with subtraction is too cumbersome to reason about efficiently.
834+
if (hasSubtraction && IsNegated(set))
829835
{
830836
return 0;
831837
}
@@ -837,40 +843,37 @@ public static int GetSetChars(string set, Span<char> chars)
837843
// based on it a) complicating things, and b) it being really unlikely to
838844
// be part of a small set.
839845
int setLength = set[SetLengthIndex];
840-
int count = 0;
846+
int count = 0, evaluated = 0;
841847
for (int i = SetStartIndex; i < SetStartIndex + setLength; i += 2)
842848
{
843849
int curSetEnd = set[i + 1];
844850
for (int c = set[i]; c < curSetEnd; c++)
845851
{
846-
if (count >= chars.Length)
852+
// Keep track of how many characters we've checked. This could work
853+
// just comparing count rather than evaluated, but we also want to
854+
// limit how much work is done here, which we can do by constraining
855+
// the number of checks to the size of the storage provided.
856+
if (++evaluated > chars.Length)
847857
{
848858
return 0;
849859
}
850860

861+
// If the set is all ranges but has a subtracted class,
862+
// validate the char is actually in the set prior to storing it:
863+
// it might be in the subtracted range.
864+
if (hasSubtraction && !CharInClass((char)c, set))
865+
{
866+
continue;
867+
}
868+
869+
Debug.Assert(count <= evaluated);
851870
chars[count++] = (char)c;
852871
}
853872
}
854873

855874
return count;
856875
}
857876

858-
public static bool TryGetAsciiSetChars(string set, [NotNullWhen(true)] out char[]? asciiChars)
859-
{
860-
Span<char> chars = stackalloc char[128];
861-
862-
chars = chars.Slice(0, GetSetChars(set, chars));
863-
864-
if (chars.IsEmpty || !IsAscii(chars))
865-
{
866-
asciiChars = null;
867-
return false;
868-
}
869-
870-
asciiChars = chars.ToArray();
871-
return true;
872-
}
873-
874877
/// <summary>
875878
/// Determines whether two sets may overlap.
876879
/// </summary>

‎src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

+15-60
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ void EmitFixedSet_LeftToRight()
939939
default:
940940
// tmp = ...IndexOfAny(setChars);
941941
// tmp = ...IndexOfAny(s_searchValues);
942-
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(primarySet.Chars), except: primarySet.Negated);
942+
EmitIndexOfAnyWithSearchValuesOrLiteral(primarySet.Chars, except: primarySet.Negated);
943943
break;
944944
}
945945
}
@@ -3587,11 +3587,10 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
35873587
if (!rtl &&
35883588
iterationCount is null &&
35893589
node.Kind is RegexNodeKind.Notonelazy &&
3590-
subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
3590+
subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal &&
35913591
!literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
35923592
(literal.String is not null ||
35933593
literal.SetChars is not null ||
3594-
(literal.AsciiChars is not null && node.Ch < 128) || // for ASCII sets, only allow when the target can be efficiently included in the set
35953594
literal.Range.LowInclusive == literal.Range.HighInclusive ||
35963595
(literal.Range.LowInclusive <= node.Ch && node.Ch <= literal.Range.HighInclusive))) // for ranges, only allow when the range overlaps with the target, since there's no accelerated way to search for the union
35973596
{
@@ -3660,18 +3659,6 @@ literal.SetChars is not null ||
36603659
break;
36613660
}
36623661
}
3663-
else if (literal.AsciiChars is not null) // set of only ASCII characters
3664-
{
3665-
char[] asciiChars = literal.AsciiChars;
3666-
overlap = asciiChars.AsSpan().Contains(node.Ch);
3667-
if (!overlap)
3668-
{
3669-
Debug.Assert(node.Ch < 128);
3670-
Array.Resize(ref asciiChars, asciiChars.Length + 1);
3671-
asciiChars[^1] = node.Ch;
3672-
}
3673-
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars));
3674-
}
36753662
else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
36763663
{
36773664
overlap = literal.Range.LowInclusive == node.Ch;
@@ -5153,21 +5140,9 @@ bool CanEmitIndexOf(RegexNode node, out int literalLength)
51535140

51545141
if (node.IsSetFamily)
51555142
{
5156-
Span<char> setChars = stackalloc char[5]; // current max that's vectorized
5157-
int setCharsCount;
5158-
if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
5159-
{
5160-
literalLength = 1;
5161-
return true;
5162-
}
5163-
5164-
if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
5165-
{
5166-
literalLength = 1;
5167-
return true;
5168-
}
5169-
5170-
if (RegexCharClass.TryGetAsciiSetChars(node.Str, out _))
5143+
Span<char> setChars = stackalloc char[128];
5144+
if (RegexCharClass.TryGetSingleRange(node.Str, out _, out _) ||
5145+
RegexCharClass.GetSetChars(node.Str, setChars) > 0)
51715146
{
51725147
literalLength = 1;
51735148
return true;
@@ -5218,26 +5193,11 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
52185193
{
52195194
bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
52205195

5221-
Span<char> setChars = stackalloc char[5]; // current max that's vectorized
5222-
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
5223-
52245196
// IndexOfAny{Except}InRange
5225-
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
5226-
if (setCharsCount is not (1 or 2) && RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
5197+
// Prefer IndexOfAnyInRange over IndexOfAny, except for tiny ranges (1 or 2 items) that IndexOfAny handles more efficiently
5198+
if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive) &&
5199+
(highInclusive - lowInclusive) > 1)
52275200
{
5228-
if (lowInclusive == highInclusive)
5229-
{
5230-
Ldc(lowInclusive);
5231-
Call((useLast, negated) switch
5232-
{
5233-
(false, false) => s_spanIndexOfChar,
5234-
(false, true) => s_spanIndexOfAnyExceptChar,
5235-
(true, false) => s_spanLastIndexOfChar,
5236-
(true, true) => s_spanLastIndexOfAnyExceptChar,
5237-
});
5238-
return;
5239-
}
5240-
52415201
Ldc(lowInclusive);
52425202
Ldc(highInclusive);
52435203
Call((useLast, negated) switch
@@ -5251,6 +5211,8 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
52515211
}
52525212

52535213
// IndexOfAny{Except}(ch1, ...)
5214+
Span<char> setChars = stackalloc char[128]; // arbitrary cut-off that accommodates all of ASCII and doesn't take too long to compute
5215+
int setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars);
52545216
if (setCharsCount > 0)
52555217
{
52565218
setChars = setChars.Slice(0, setCharsCount);
@@ -5293,17 +5255,10 @@ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
52935255
return;
52945256

52955257
default:
5296-
EmitIndexOfAnyWithSearchValuesOrLiteral(setChars.ToString(), last: useLast, except: negated);
5258+
EmitIndexOfAnyWithSearchValuesOrLiteral(setChars, last: useLast, except: negated);
52975259
return;
52985260
}
52995261
}
5300-
5301-
// IndexOfAny{Except}(SearchValues<char>)
5302-
if (RegexCharClass.TryGetAsciiSetChars(node.Str, out char[]? asciiChars))
5303-
{
5304-
EmitIndexOfAnyWithSearchValuesOrLiteral(new string(asciiChars), last: useLast, except: negated);
5305-
return;
5306-
}
53075262
}
53085263

53095264
Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered.");
@@ -6197,15 +6152,15 @@ private void EmitTimeoutCheckIfNeeded()
61976152
}
61986153

61996154
/// <summary>Emits a call to either IndexOfAny("abcd") or IndexOfAny(SearchValues) depending on the <paramref name="chars"/>.</summary>
6200-
private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = false, bool except = false)
6155+
private void EmitIndexOfAnyWithSearchValuesOrLiteral(ReadOnlySpan<char> chars, bool last = false, bool except = false)
62016156
{
6202-
Debug.Assert(chars.Length > 3);
6157+
Debug.Assert(chars.Length > 3, $"chars.Length == {chars.Length}");
62036158

62046159
// SearchValues<char> is faster than a regular IndexOfAny("abcd") for sets of 4/5 values iff they are ASCII.
62056160
// Only emit SearchValues instances when we know they'll be faster to avoid increasing the startup cost too much.
62066161
if (chars.Length is 4 or 5 && !RegexCharClass.IsAscii(chars))
62076162
{
6208-
Ldstr(chars);
6163+
Ldstr(chars.ToString());
62096164
Call(s_stringAsSpanMethod);
62106165
Call((last, except) switch
62116166
{
@@ -6217,7 +6172,7 @@ private void EmitIndexOfAnyWithSearchValuesOrLiteral(string chars, bool last = f
62176172
}
62186173
else
62196174
{
6220-
LoadSearchValues(chars.ToCharArray());
6175+
LoadSearchValues(chars.ToArray());
62216176
Call((last, except) switch
62226177
{
62236178
(false, false) => s_spanIndexOfAnySearchValues,

‎src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

+12-3
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
9494
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
9595
{
9696
// See if the set is limited to holding only a few characters.
97-
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
97+
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today without SearchValues, which isn't used for RTL
9898
int scratchCount;
9999
char[]? chars = null;
100100
if (!RegexCharClass.IsNegated(charClass) &&
@@ -278,7 +278,6 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
278278
/// <summary>Data about a character class at a fixed offset from the start of any match to a pattern.</summary>
279279
public struct FixedDistanceSet(char[]? chars, string set, int distance)
280280
{
281-
282281
/// <summary>The character class description.</summary>
283282
public string Set = set;
284283
/// <summary>Whether the <see cref="Set"/> is negated.</summary>
@@ -606,9 +605,9 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
606605
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
607606
{
608607
FixedDistanceSet primarySet = FixedDistanceSets![0];
609-
char[]? chars = primarySet.Chars;
610608

611609
ReadOnlySpan<char> span = textSpan.Slice(pos);
610+
char[]? chars = primarySet.Chars;
612611
if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues
613612
{
614613
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
@@ -618,6 +617,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
618617
return true;
619618
}
620619
}
620+
else if (primarySet.Range is not null)
621+
{
622+
(char low, char high) = primarySet.Range.GetValueOrDefault();
623+
int i = primarySet.Negated ? span.IndexOfAnyExceptInRange(low, high) : span.IndexOfAnyInRange(low, high);
624+
if (i >= 0)
625+
{
626+
pos += i;
627+
return true;
628+
}
629+
}
621630
else
622631
{
623632
ref uint[]? startingAsciiLookup = ref _asciiLookups![0];

‎src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

+7-22
Original file line numberDiff line numberDiff line change
@@ -1426,10 +1426,8 @@ public char FirstCharOfOneOrMulti()
14261426
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
14271427
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
14281428
/// </returns>
1429-
public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
1429+
public StartingLiteralData? FindStartingLiteral()
14301430
{
1431-
Debug.Assert(maxSetCharacters is >= 0 and <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
1432-
14331431
if (FindStartingLiteralNode() is RegexNode node)
14341432
{
14351433
switch (node.Kind)
@@ -1441,23 +1439,18 @@ public char FirstCharOfOneOrMulti()
14411439
return new StartingLiteralData(range: (node.Ch, node.Ch), negated: true);
14421440

14431441
case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy:
1444-
Span<char> setChars = stackalloc char[maxSetCharacters];
1445-
int numChars;
1446-
if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
1447-
{
1448-
setChars = setChars.Slice(0, numChars);
1449-
return new StartingLiteralData(setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
1450-
}
1451-
1452-
if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
1442+
if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive) &&
1443+
(highInclusive - lowInclusive) > 1) // prefer IndexOfAny for 1 or 2 elements as an optimization
14531444
{
14541445
Debug.Assert(lowInclusive < highInclusive);
14551446
return new StartingLiteralData(range: (lowInclusive, highInclusive), negated: RegexCharClass.IsNegated(node.Str!));
14561447
}
14571448

1458-
if (RegexCharClass.TryGetAsciiSetChars(node.Str!, out char[]? asciiChars))
1449+
Span<char> setChars = stackalloc char[128];
1450+
int numChars;
1451+
if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
14591452
{
1460-
return new StartingLiteralData(asciiChars: asciiChars, negated: RegexCharClass.IsNegated(node.Str!));
1453+
return new StartingLiteralData(setChars: setChars.Slice(0, numChars).ToString(), negated: RegexCharClass.IsNegated(node.Str!));
14611454
}
14621455
break;
14631456

@@ -1475,7 +1468,6 @@ public readonly struct StartingLiteralData
14751468
public readonly (char LowInclusive, char HighInclusive) Range;
14761469
public readonly string? String;
14771470
public readonly string? SetChars;
1478-
public readonly char[]? AsciiChars;
14791471
public readonly bool Negated;
14801472

14811473
public StartingLiteralData((char LowInclusive, char HighInclusive) range, bool negated)
@@ -1496,13 +1488,6 @@ public StartingLiteralData(string? setChars, bool negated)
14961488
SetChars = setChars;
14971489
Negated = negated;
14981490
}
1499-
1500-
public StartingLiteralData(char[]? asciiChars, bool negated)
1501-
{
1502-
Debug.Assert(asciiChars is not null);
1503-
AsciiChars = asciiChars;
1504-
Negated = negated;
1505-
}
15061491
}
15071492

15081493
/// <summary>

‎src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

+10-11
Original file line numberDiff line numberDiff line change
@@ -535,25 +535,24 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
535535

536536
// For every entry, try to get the chars that make up the set, if there are few enough.
537537
// For any for which we couldn't get the small chars list, see if we can get other useful info.
538-
Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
538+
Span<char> scratch = stackalloc char[128];
539539
for (int i = 0; i < results.Count; i++)
540540
{
541541
RegexFindOptimizations.FixedDistanceSet result = results[i];
542542
result.Negated = RegexCharClass.IsNegated(result.Set);
543543

544-
int count = RegexCharClass.GetSetChars(result.Set, scratch);
545-
if (count > 0)
544+
if (RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive) &&
545+
(highInclusive - lowInclusive) > 1) // prefer IndexOfAny for tiny sets of 1 or 2 elements
546546
{
547-
result.Chars = scratch.Slice(0, count).ToArray();
547+
result.Range = (lowInclusive, highInclusive);
548548
}
549-
550-
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
551-
if (thorough &&
552-
(result.Chars is null || result.Chars.Length > 2) &&
553-
RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
549+
else
554550
{
555-
result.Chars = null;
556-
result.Range = (lowInclusive, highInclusive);
551+
int count = RegexCharClass.GetSetChars(result.Set, scratch);
552+
if (count > 0)
553+
{
554+
result.Chars = scratch.Slice(0, count).ToArray();
555+
}
557556
}
558557

559558
results[i] = result;

0 commit comments

Comments
 (0)
Please sign in to comment.