diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index e54ffd3ecc725..19488703f07f7 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan chars, Dictio } /// Adds a SearchValues instance declaration to the required helpers collection. - private static string EmitSearchValues(char[] asciiChars, Dictionary requiredHelpers) + private static string EmitSearchValues(char[] chars, Dictionary requiredHelpers) { - Debug.Assert(RegexCharClass.IsAscii(asciiChars)); + Array.Sort(chars); - // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. - byte[] bitmap = new byte[16]; - foreach (char c in asciiChars) + string fieldName; + if (RegexCharClass.IsAscii(chars)) { - bitmap[c >> 3] |= (byte)(1 << (c & 7)); + // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key. 
+ var bitmap = new byte[16]; + foreach (char c in chars) + { + bitmap[c >> 3] |= (byte)(1 << (c & 7)); + } + + string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); + + fieldName = hexBitmap switch + { + "FFFFFFFF000000000000000000000080" => "s_asciiControl", + "000000000000FF030000000000000000" => "s_asciiDigits", + "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", + "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", + "000000000000FF037E0000007E000000" => "s_asciiHexDigits", + "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", + "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", + "00000000EEF7008C010000B800000028" => "s_asciiPunctuation", + "00000000010000000000000000000000" => "s_asciiSeparators", + "00000000100800700000004001000050" => "s_asciiSymbols", + "003E0000010000000000000000000000" => "s_asciiWhiteSpace", + "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars", + + "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl", + "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits", + "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters", + "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits", + "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower", + "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation", + "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators", + "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols", + "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper", + "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace", + "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars", + + _ => $"s_ascii_{hexBitmap.TrimStart('0')}" + }; } - - string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty); - - string fieldName = hexBitmap switch + else { - "FFFFFFFF000000000000000000000080" => "s_asciiControl", - "000000000000FF030000000000000000" => "s_asciiDigits", - 
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters", - "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits", - "000000000000FF037E0000007E000000" => "s_asciiHexDigits", - "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower", - "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper", - "00000000EEF7008C010000B800000028" => "s_asciiPunctuation", - "00000000010000000000000000000000" => "s_asciiSeparators", - "00000000100800700000004001000050" => "s_asciiSymbols", - "003E0000010000000000000000000000" => "s_asciiWhiteSpace", - "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars", - - "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl", - "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits", - "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters", - "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits", - "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower", - "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation", - "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators", - "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols", - "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper", - "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace", - "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars", - - _ => $"s_ascii_{hexBitmap.TrimStart('0')}" - }; + using (SHA256 sha = SHA256.Create()) + { +#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0 + fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}"; +#pragma warning restore CA1850 + } + } if (!requiredHelpers.ContainsKey(fieldName)) { - Array.Sort(asciiChars); - - string setLiteral = Literal(new string(asciiChars)); + string setLiteral = Literal(new string(chars)); requiredHelpers.Add(fieldName, new string[] { @@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary(); + 
var excludedAsciiChars = new List(); for (int i = 0; i < 128; i++) { if (!RegexCharClass.CharInClass((char)i, set)) { - asciiChars.Add((char)i); + excludedAsciiChars.Add((char)i); } } @@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary span)"); lines.Add($"{{"); int uncheckedStart = lines.Count; - lines.Add(asciiChars.Count == 128 ? + lines.Add(excludedAsciiChars.Count == 128 ? $" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" : - $" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});"); + $" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});"); lines.Add($" if ((uint)i < (uint)span.Length)"); lines.Add($" {{"); lines.Add($" if (char.IsAscii(span[i]))"); @@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight() string indexOf; if (primarySet.Chars is not null) { + Debug.Assert(primarySet.Chars.Length > 0); + // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload. 
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny"; if (primarySet.Negated) @@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight() indexOf = primarySet.Chars.Length switch { + // 1, 2, 3 have dedicated optimized IndexOfAny overloads 1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})", 2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})", 3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})", - _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + + // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan overload, + // but can also be handled via SearchValues + 4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})", + + // > 5 can only be handled efficiently via SearchValues + _ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})", }; } - else if (primarySet.AsciiSet is not null) - { - // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it. - Debug.Assert(!primarySet.Negated); - indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})"; - } else if (primarySet.Range is not null) { // We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case, @@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight() } else { - // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that - // will perform the search as efficiently as possible. + // We have an arbitrary set of characters that's really large or otherwise not enumerable. + // We use a custom IndexOfAny helper that will perform the search as efficiently as possible. 
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()"; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 97e728a08ca64..20ccc3afefcca 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -903,6 +903,7 @@ void EmitFixedSet_LeftToRight() if (primarySet.Chars is not null) { + Debug.Assert(primarySet.Chars.Length > 0); switch (primarySet.Chars.Length) { case 1: @@ -926,19 +927,23 @@ void EmitFixedSet_LeftToRight() Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); break; - default: + case 4 or 5: + // tmp = ...IndexOfAny("abcd"); + // Note that this case differs slightly from the source generator, where it might choose to use + // SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so + // it just always uses IndexOfAny(span). Ldstr(new string(primarySet.Chars)); Call(s_stringAsSpanMethod); Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); break; + + default: + // tmp = ...IndexOfAny(s_searchValues); + LoadSearchValues(primarySet.Chars); + Call(primarySet.Negated ? 
s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues); + break; } } - else if (primarySet.AsciiSet is not null) - { - Debug.Assert(!primarySet.Negated); - LoadSearchValues(primarySet.AsciiSet); - Call(s_spanIndexOfAnySearchValues); - } else if (primarySet.Range is not null) { if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 7597b37edcc90..517c9da6b4270 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -271,8 +271,6 @@ public FixedDistanceSet(char[]? chars, string set, int distance) public int Distance; /// As an alternative to , a description of the single range the set represents, if it does. public (char LowInclusive, char HighInclusive)? Range; - /// As an alternative to , a description of the set of ASCII characters it represents, if it does. - public char[]? AsciiSet; } /// When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop. @@ -593,7 +591,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, char[]? chars = primarySet.Chars; ReadOnlySpan span = textSpan.Slice(pos); - if (chars is not null) + if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues { int i = primarySet.Negated ? 
span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars); if (i >= 0) @@ -660,7 +658,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength); - if (primarySet.Chars is not null) + if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues { for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 56ff64cdb182f..96c50215c7b77 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -194,7 +194,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) TryFindRawFixedSets(root, results, ref distance, thorough); #if DEBUG results.ForEach(r => Debug.Assert( - !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null, + !r.Negated && r.Chars is null && r.Range is null, $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}")); #endif @@ -225,31 +225,25 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) // For every entry, try to get the chars that make up the set, if there are few enough. // For any for which we couldn't get the small chars list, see if we can get other useful info. 
- Span scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today + Span scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues for (int i = 0; i < results.Count; i++) { RegexFindOptimizations.FixedDistanceSet result = results[i]; result.Negated = RegexCharClass.IsNegated(result.Set); int count = RegexCharClass.GetSetChars(result.Set, scratch); - if (count > 0) { result.Chars = scratch.Slice(0, count).ToArray(); } - if (thorough) + // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3 or more values that fit in a single range. + if (thorough && + (result.Chars is null || result.Chars.Length > 2) && + RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) { - // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range. - if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive)) - { - result.Chars = null; - result.Range = (lowInclusive, highInclusive); - } - else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars)) - { - result.AsciiSet = asciiChars; - } + result.Chars = null; + result.Range = (lowInclusive, highInclusive); } results[i] = result; @@ -472,8 +466,8 @@ public static void SortFixedDistanceSetsByQuality(List { - char[]? s1Chars = s1.Chars ?? s1.AsciiSet; - char[]? s2Chars = s2.Chars ?? s2.AsciiSet; + char[]? s1Chars = s1.Chars; + char[]? s2Chars = s2.Chars; int s1CharsLength = s1Chars?.Length ?? 0; int s2CharsLength = s2Chars?.Length ?? 0; bool s1Negated = s1.Negated;