diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index 63eddeaa977f08..a85ecef2303173 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -887,6 +887,22 @@ public static bool ParticipatesInCaseConversion(ReadOnlySpan s)
             return false;
         }
 
+        /// <summary>Gets whether the specified span contains only ASCII.</summary>
+        /// <param name="s">The characters to examine; an empty span is considered ASCII.</param>
+        /// <returns><see langword="true"/> if every character in <paramref name="s"/> is below 128; otherwise, <see langword="false"/>.</returns>
+        public static bool IsAscii(ReadOnlySpan<char> s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available
+        {
+            foreach (char c in s)
+            {
+                if (c >= 128)
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
         /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.
         /// This may enumerate negated characters if the set is negated.
         private static bool CanEasilyEnumerateSetContents(string set) =>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 3b8b25539c2bef..12afbb48f663d5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2406,9 +2406,15 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
             int i = childIndex;
             for ( ; i < exclusiveChildBound; i++)
             {
-                // We focus on only Ones and Sets. A sequence of Ones should have already been turned into a Multi,
-                // so we don't bother looking for Oneloop/etc.
                 RegexNode child = Child(i);
+                if ((child.Options & RegexOptions.IgnoreCase) != 0)
+                {
+                    // TODO https://github.com/dotnet/runtime/issues/61048: Remove this block once fixed.
+                    // We don't want any nodes that are still IgnoreCase, as they'd no longer be IgnoreCase if
+                    // they were applicable to this optimization.
+                    break;
+                }
+
                 if (child.Kind is RegexNodeKind.One)
                 {
                     // We only want to include ASCII characters, and only if they don't participate in case conversion
@@ -2419,8 +2425,21 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
                     {
                         break;
                     }
+
                     vsb.Append(child.Ch);
                 }
+                else if (child.Kind is RegexNodeKind.Multi)
+                {
+                    // As with RegexNodeKind.One, the string needs to be composed solely of ASCII characters that
+                    // don't participate in case conversion.
+                    if (!RegexCharClass.IsAscii(child.Str.AsSpan()) ||
+                        RegexCharClass.ParticipatesInCaseConversion(child.Str.AsSpan()))
+                    {
+                        break;
+                    }
+
+                    vsb.Append(child.Str);
+                }
                 else if (child.Kind is RegexNodeKind.Set ||
                          (child.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic && child.M == child.N))
                 {