diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 88a4211c251872..10ec814bea2649 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -154,7 +154,15 @@ private void MakeRep(RegexNodeKind kind, int min, int max) N = max; } - private void MakeLoopAtomic() + /// Converts this loop node to be atomic. + /// + /// If a loop is atomic by construction, e.g. it's at the end of the pattern + /// or its parent is an atomic group, there's no backtracking into it, which means it does its full + /// initial processing and then stops. For an eager loop, that means consuming as much as possible, + /// but for a lazy loop, that means consuming as little as possible. Thus, if this is true, a lazy + /// loop should lower its max iteration count to its min iteration count. + /// + private void MakeLoopAtomic(bool noBacktrackingByPosition = false) { switch (Kind) { @@ -165,11 +173,15 @@ private void MakeLoopAtomic() break; case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy: - // For lazy, we not only change the Type, we also lower the max number of iterations - // to the minimum number of iterations, creating a repeater, as they should end up - // matching as little as possible. + // For lazy, we not only change the Type, if we're at the end of the pattern, + // we also lower the max number of iterations to the minimum number of iterations, + // creating a repeater, as they should end up matching as little as possible. Kind += RegexNodeKind.Oneloopatomic - RegexNodeKind.Onelazy; - N = M; + if (noBacktrackingByPosition) + { + N = M; + } + if (N == 0) { // If moving the max to be the same as the min dropped it to 0, there's no @@ -189,6 +201,34 @@ private void MakeLoopAtomic() } break; + case RegexNodeKind.Loop: + if (Parent is not { Kind: RegexNodeKind.Atomic }) + { + RegexNode loopAsChild = new(RegexNodeKind.Loop, Options, M, N); + Kind = RegexNodeKind.Atomic; + M = N = 0; + loopAsChild.AddChild(Child(0)); + ReplaceChild(0, loopAsChild); + } + break; + + case RegexNodeKind.Lazyloop: + if (noBacktrackingByPosition) + { + N = M; + } + + if (N != 0) + { + // A lazy loop that becomes atomic gets treated the same as a greedy loop, + // so we can share the same logic. + goto case RegexNodeKind.Loop; + } + + Kind = RegexNodeKind.Empty; + Children = null; + break; + default: Debug.Fail($"Unexpected type: {Kind}"); break; @@ -435,7 +475,7 @@ private void EliminateEndingBacktracking() // or even empty nodes. case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop: case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy: - node.MakeLoopAtomic(); + node.MakeLoopAtomic(noBacktrackingByPosition: true); break; // Just because a particular node is atomic doesn't mean all its descendants are. @@ -456,7 +496,7 @@ private void EliminateEndingBacktracking() case RegexNodeKind.Concatenate when !rtl: RegexNode existingChild = node.Child(node.ChildCount() - 1); if ((existingChild.Kind is RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional or RegexNodeKind.Loop or RegexNodeKind.Lazyloop) && - (node.Parent is null || node.Parent.Kind != RegexNodeKind.Atomic)) // validate grandparent isn't atomic + node.Parent is not { Kind: RegexNodeKind.Atomic }) // validate grandparent isn't atomic { var atomic = new RegexNode(RegexNodeKind.Atomic, existingChild.Options); atomic.AddChild(existingChild); @@ -494,26 +534,30 @@ private void EliminateEndingBacktracking() // e.g. (?:abc*)* => (?:ab(?>c*))* // e.g. (abc*?)+? => (ab){1} case RegexNodeKind.Lazyloop: - node.N = node.M; - goto case RegexNodeKind.Loop; case RegexNodeKind.Loop: { - if (node.N == 1) + // Make the loop atomic, if it isn't already. This entails changing node to instead be an Atomic node + // that has the {Lazy}Loop as its child. If the parent of the loop is already Atomic, this will be a nop. + node.MakeLoopAtomic(noBacktrackingByPosition: true); + Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.Empty or RegexNodeKind.Loop or RegexNodeKind.Lazyloop); + + if (node.Kind is RegexNodeKind.Atomic) { - // If the loop has a max iteration count of 1 (e.g. it's an optional node), - // there's no possibility for conflict between multiple iterations, so - // we can process it. node = node.Child(0); - continue; + Debug.Assert(node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop); } - if (!rtl) + if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop) { - RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); - if (loopDescendent != null) + if (node.N == 1 || CanBeMadeAtomic(node.Child(0), node.Child(0), iterateNullableSubsequent: false, allowLazy: false)) { - node = loopDescendent; - continue; // loop around to process node + // If the loop has a max iteration count of 1 (e.g. it's an optional node), + // there's no possibility for conflict between multiple iterations, so + // we can process it. Or, if the node can be made atomic with itself as a subsequent + // node (which is logically what happens when there are multiple iterations), we can also + // recur into its child. + node = node.Child(0); + continue; } } } @@ -643,7 +687,7 @@ private RegexNode ReduceAtomic() case RegexNodeKind.Onelazy: case RegexNodeKind.Notonelazy: case RegexNodeKind.Setlazy: - child.MakeLoopAtomic(); + child.MakeLoopAtomic(noBacktrackingByPosition: true); return child; // Alternations have a variety of possible optimizations that can be applied @@ -759,7 +803,9 @@ private RegexNode ReduceAtomic() // For everything else, try to reduce ending backtracking of the last contained expression. default: child.EliminateEndingBacktracking(); - return atomic; + return child.Kind == RegexNodeKind.Empty ? + child : // if the child became empty, then the atomic node isn't needed + atomic; } } @@ -1850,31 +1896,9 @@ static void ProcessNode(RegexNode node, RegexNode subsequent) } // Skip down the node past irrelevant nodes. - while (true) + while (node.Kind is RegexNodeKind.Capture or RegexNodeKind.Concatenate) { - // We can always recur into captures and into the last node of concatenations. - if (node.Kind is RegexNodeKind.Capture or RegexNodeKind.Concatenate) - { - node = node.Child(node.ChildCount() - 1); - continue; - } - - // For loops with at least one guaranteed iteration, we can recur into them, but - // we need to be careful not to just always do so; the ending node of a loop can only - // be made atomic if what comes after the loop but also the beginning of the loop are - // compatible for the optimization. - if (node.Kind == RegexNodeKind.Loop) - { - RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); - if (loopDescendent != null) - { - node = loopDescendent; - continue; - } - } - - // Can't skip any further. - break; + node = node.Child(node.ChildCount() - 1); } // If the node can be changed to atomic based on what comes after it, do so. @@ -1909,6 +1933,66 @@ static void ProcessNode(RegexNode node, RegexNode subsequent) node.MakeLoopAtomic(); break; + case RegexNodeKind.Loop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: true, allowLazy: false): + case RegexNodeKind.Lazyloop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: false, allowLazy: true): + // General loops and lazy loops can also be made atomic, but we need to be very careful in doing so. Making such loops + // atomic means wrapping them in an atomic group, and children of these loops can look up through their ancestry, see + // such an atomic group, and then decide to alter their behavior because backtracking isn't possible. For example, if + // a developer writes the pattern (?>(abcd*?)+)e, it is safe for that inner lazy char loop to see that nothing can + // backtrack into it, such that the lazy loop can match the minimum possible, such that the loop evaporates entirely, + // and it becomes (?>(abc)+)e. Thus, given a pattern like (abcd*?)+e, even though the outer loop can be made atomic, + // because the beginning/end of the loop don't overlap with each other or with their successor, we can't just wrap it + // in an atomic block, because that would then trigger the nested loop to behave incorrectly. We can address this in + // multiple ways, such as by tagging Atomic nodes we introduce as being different from ones originally part of the pattern, + // and then having children treat them differently when looking at their ancestors, or we can address it by only introducing + // such an atomic node when we can see it's safe for the children. For now, this does the latter, and to be conservative, + // it allowlists a small known set of children types. + RegexNode loopChild = node.Child(0); + while (loopChild.Kind is RegexNodeKind.Capture or RegexNodeKind.Concatenate) + { + loopChild = loopChild.Child(loopChild.ChildCount() - 1); + } + + if (loopChild.Kind is + RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or + RegexNodeKind.Multi or + RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set) + { + // For types on the allow list, we can make the loop itself atomic. + node.MakeLoopAtomic(); + } + else if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop) + { + // For everything else, we can't make the loop itself atomic, but we can + // possibly continue to make children of the loop atomic. + goto case RegexNodeKind.Loop; + } + break; + + // For all other loops, we may not be able to make them atomic, but we might still be able to make a node + // they end with be atomic. If the loop has a max iteration count of 1, then we don't need to worry about it + // following itself and can simply examine its child. If it has a max iteration count greater than 1, then + // we can examine its child iff its child could be made atomic against itself. + case RegexNodeKind.Loop: + { + RegexNode child = node.Child(0); + if (node.N == 1 || CanBeMadeAtomic(child, child, iterateNullableSubsequent: false, allowLazy: false)) + { + ProcessNode(child, subsequent); + } + } + break; + + case RegexNodeKind.Lazyloop: + { + RegexNode child = node.Child(0); + if (node.N == 1 || CanBeMadeAtomic(child, child, iterateNullableSubsequent: false, allowLazy: true)) + { + ProcessNode(child, subsequent); + } + } + break; + case RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional: // In the case of alternation, we can't change the alternation node itself // based on what comes after it (at least not with more complicated analysis @@ -1931,47 +2015,6 @@ static void ProcessNode(RegexNode node, RegexNode subsequent) } } - /// - /// Recurs into the last expression of a loop node, looking to see if it can find a node - /// that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. - /// - /// The found node that should be explored further for auto-atomicity; null if it doesn't exist. - private RegexNode? FindLastExpressionInLoopForAutoAtomic() - { - RegexNode node = this; - - Debug.Assert((node.Options & RegexOptions.RightToLeft) == 0, "Currently only implemented for left-to-right"); - Debug.Assert(node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop); - - // Start by looking at the loop's sole child. - node = node.Child(0); - - // Skip past captures. - while (node.Kind == RegexNodeKind.Capture) - { - node = node.Child(0); - } - - // If the loop's body is a concatenate, we can skip to its last child iff that - // last child doesn't conflict with the first child, since this whole concatenation - // could be repeated, such that the first node ends up following the last. For - // example, in the expression (a+[def])*, the last child is [def] and the first is - // a+, which can't possibly overlap with [def]. In contrast, if we had (a+[ade])*, - // [ade] could potentially match the starting 'a'. - if (node.Kind == RegexNodeKind.Concatenate) - { - int concatCount = node.ChildCount(); - RegexNode lastConcatChild = node.Child(concatCount - 1); - if (CanBeMadeAtomic(lastConcatChild, node.Child(0), iterateNullableSubsequent: false, allowLazy: false)) - { - return lastConcatChild; - } - } - - // Otherwise, the loop has nothing that can participate in auto-atomicity. - return null; - } - /// Optimizations for positive and negative lookaheads/behinds. private RegexNode ReduceLookaround() { @@ -2065,6 +2108,12 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i return false; } + // Skip down past irrelevant nodes. + while (node.Kind is RegexNodeKind.Capture or RegexNodeKind.Concatenate) + { + node = node.Child(node.ChildCount() - 1); + } + // In most case, we'll simply check the node against whatever subsequent is. However, in case // subsequent ends up being a loop with a min bound of 0, we'll also need to evaluate the node // against whatever comes after subsequent. In that case, we'll walk the tree to find the @@ -2072,16 +2121,13 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i while (true) { // Skip the successor down to the closest node that's guaranteed to follow it. - int childCount; - while ((childCount = subsequent.ChildCount()) > 0) + while (true) { - Debug.Assert(subsequent.Kind != RegexNodeKind.Group); switch (subsequent.Kind) { case RegexNodeKind.Concatenate: case RegexNodeKind.Capture: case RegexNodeKind.Atomic: - case RegexNodeKind.PositiveLookaround when (subsequent.Options & RegexOptions.RightToLeft) == 0: // only lookaheads, not lookbehinds (represented as RTL PositiveLookaround nodes) case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when subsequent.M > 0: subsequent = subsequent.Child(0); continue; @@ -2103,10 +2149,11 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i // only a yes branch, we'd need to also check whatever comes after the conditional). It doesn't apply to // backreference conditionals, as the condition itself is unknown statically and could overlap with the // loop being considered for atomicity. + int childCount = subsequent.ChildCount(); switch (subsequent.Kind) { case RegexNodeKind.Alternate: - case RegexNodeKind.ExpressionConditional when childCount == 3: // condition, yes, and no branch + case RegexNodeKind.ExpressionConditional when childCount is 3: // condition, yes, and no branch for (int i = 0; i < childCount; i++) { if (!CanBeMadeAtomic(node, subsequent.Child(i), iterateNullableSubsequent, allowLazy: false)) @@ -2198,6 +2245,53 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i } break; + case RegexNodeKind.Loop: + case RegexNodeKind.Lazyloop when allowLazy: + // With single character loops (e.g. OneLoop, NotOneLoop, SetLoop), we only need to prove there's no overlap between + // what that single character could be and what comes next. For arbitrary loops, we have more to prove. First, we need + // to understand what the loop can possibly start with and what it can possibly end with (with a single character loop, + // those are the same things), and we need to ensure that there's no overlap between those two sets; otherwise, a second + // iteration of a loop could end up giving back characters that could be consumed by the previous iteration. Second, we need + // to ensure that neither the starting set nor the ending set overlaps with what could possibly come after it, for the same reason. + RegexNode loopChild = node.Child(0); + if (RegexPrefixAnalyzer.FindFirstCharClass(loopChild) is not string loopStartingSet || + RegexPrefixAnalyzer.FindLastCharClass(loopChild) is not string loopEndingSet || + (node.N > 1 && RegexCharClass.MayOverlap(loopStartingSet, loopEndingSet))) + { + return false; + } + + bool CharInStartingOrEndingSet(char ch) => + RegexCharClass.CharInClass(ch, loopStartingSet) || RegexCharClass.CharInClass(ch, loopEndingSet); + + bool MayOverlapStartingOrEndingSet(string set) => + RegexCharClass.MayOverlap(set, loopStartingSet) || RegexCharClass.MayOverlap(set, loopEndingSet); + + switch (subsequent.Kind) + { + case RegexNodeKind.One when !CharInStartingOrEndingSet(subsequent.Ch): + case RegexNodeKind.Set when !MayOverlapStartingOrEndingSet(subsequent.Str!): + case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M > 0 && !CharInStartingOrEndingSet(subsequent.Ch): + case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M > 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!): + case RegexNodeKind.Multi when !CharInStartingOrEndingSet(subsequent.Str![0]): + case RegexNodeKind.End: + case RegexNodeKind.EndZ or RegexNodeKind.Eol when !CharInStartingOrEndingSet('\n'): + return true; + + case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch): + case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!): + case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(loopStartingSet) && RegexCharClass.IsKnownWordClassSubset(loopEndingSet): + case RegexNodeKind.NonBoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass): + case RegexNodeKind.ECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass) && (loopEndingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass): + case RegexNodeKind.NonECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass): + // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. + break; + + default: + return false; + } + break; + default: return false; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 6be7fdda3fc948..93ab67a2c392a0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -895,13 +895,22 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne }); /// - /// Computes a character class for the first character in tree. This uses a more robust algorithm - /// than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. For example, + /// Computes a character class for the first character in the node. + /// + /// + /// This uses a more robust algorithm than is used by TryFindFixedLiterals and thus can find starting sets it couldn't. For example, /// fixed literals won't find the starting set for a*b, as the a isn't guaranteed and the b is at a /// variable position, but this will find [ab] as it's instead looking for anything that under any /// circumstance could possibly start a match. + /// + public static string? FindFirstCharClass(RegexNode root) => FindFirstOrLastCharClass(root, findFirst: true); + + /// + /// Computes a character class for the last character in the node. /// - public static string? FindFirstCharClass(RegexNode root) + public static string? FindLastCharClass(RegexNode root) => FindFirstOrLastCharClass(root, findFirst: false); + + private static string? FindFirstOrLastCharClass(RegexNode root, bool findFirst) { // Explore the graph, adding found chars into a result set, which is lazily initialized so that // we can initialize it to a parsed set if we discover one first (this is helpful not just for allocation @@ -913,7 +922,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne // whole pattern was nullable such that it could match an empty string, in which case we // can't make any statements about what begins a match. RegexCharClass? cc = null; - return TryFindFirstCharClass(root, ref cc) == true ? + return TryFindFirstOrLastCharClass(root, findFirst, ref cc) == true ? cc!.ToStringClass() : null; @@ -930,7 +939,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne // it's zero-width (e.g. empty, a lookaround, an anchor, etc.) or it could be zero-width // (e.g. a loop with a min bound of 0). A concatenation processing a child that returns // null needs to keep processing the next child. - static bool? TryFindFirstCharClass(RegexNode node, ref RegexCharClass? cc) + static bool? TryFindFirstOrLastCharClass(RegexNode node, bool findFirst, ref RegexCharClass? cc) { if (!StackHelper.TryEnsureSufficientExecutionStack()) { @@ -992,7 +1001,8 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne if (cc is null || cc.CanMerge) { cc ??= new RegexCharClass(); - cc.AddChar(node.Str![(node.Options & RegexOptions.RightToLeft) != 0 ? node.Str.Length - 1 : 0]); + bool firstChar = findFirst == ((node.Options & RegexOptions.RightToLeft) == 0); + cc.AddChar(node.Str![firstChar ? 0 : node.Str.Length - 1]); return true; } return false; @@ -1019,14 +1029,14 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne // Groups. These don't contribute anything of their own, and are just pass-throughs to their children. case RegexNodeKind.Atomic: case RegexNodeKind.Capture: - return TryFindFirstCharClass(node.Child(0), ref cc); + return TryFindFirstOrLastCharClass(node.Child(0), findFirst, ref cc); // Loops. Like groups, these are mostly pass-through: if the child fails, then the whole operation needs // to fail, and if the child is nullable, then the loop is as well. However, if the child succeeds but // the loop has a lower bound of 0, then the loop is still nullable. case RegexNodeKind.Loop: case RegexNodeKind.Lazyloop: - return TryFindFirstCharClass(node.Child(0), ref cc) switch + return TryFindFirstOrLastCharClass(node.Child(0), findFirst, ref cc) switch { false => false, null => null, @@ -1040,12 +1050,26 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne case RegexNodeKind.Concatenate: { int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) + if (findFirst) + { + for (int i = 0; i < childCount; i++) + { + bool? childResult = TryFindFirstOrLastCharClass(node.Child(i), findFirst, ref cc); + if (childResult != null) + { + return childResult; + } + } + } + else { - bool? childResult = TryFindFirstCharClass(node.Child(i), ref cc); - if (childResult != null) + for (int i = childCount - 1; i >= 0; i--) { - return childResult; + bool? childResult = TryFindFirstOrLastCharClass(node.Child(i), findFirst, ref cc); + if (childResult != null) + { + return childResult; + } } } return null; @@ -1060,7 +1084,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne bool anyChildWasNull = false; for (int i = 0; i < childCount; i++) { - bool? childResult = TryFindFirstCharClass(node.Child(i), ref cc); + bool? childResult = TryFindFirstOrLastCharClass(node.Child(i), findFirst, ref cc); if (childResult is null) { anyChildWasNull = true; @@ -1078,7 +1102,7 @@ static int GetRangeLength((char LowInclusive, char HighInclusive) range, bool ne case RegexNodeKind.BackreferenceConditional: case RegexNodeKind.ExpressionConditional: int branchStart = node.Kind is RegexNodeKind.BackreferenceConditional ? 0 : 1; - return (TryFindFirstCharClass(node.Child(branchStart), ref cc), TryFindFirstCharClass(node.Child(branchStart + 1), ref cc)) switch + return (TryFindFirstOrLastCharClass(node.Child(branchStart), findFirst, ref cc), TryFindFirstOrLastCharClass(node.Child(branchStart + 1), findFirst, ref cc)) switch { (false, _) or (_, false) => false, (null, _) or (_, null) => null, diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index b7f48470256d02..b720936b59b995 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -272,6 +272,12 @@ public static IEnumerable Match_MemberData() yield return (@"(b|a|aa)((?:aa)+?)+?$", "aaaaaaaa", RegexOptions.None, 0, 8, true, "aaaaaaaa"); yield return (@"(|a|aa)(((?:aa)+?)+?|aaaaab)\w$", "aaaaaabc", RegexOptions.None, 0, 8, true, "aaaaaabc"); + // Nested loops + yield return (@"(abcd*)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); + yield return (@"(abcd*?)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); + yield return (@"(abcd*)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); + yield return (@"(abcd*?)+?e", "abcde", RegexOptions.None, 0, 5, true, "abcde"); + // Testing selected FindOptimizations finds the right prefix yield return (@"(^|a+)bc", " aabc", RegexOptions.None, 0, 5, true, "aabc"); yield return (@"(^|($|a+))bc", " aabc", RegexOptions.None, 0, 5, true, "aabc"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index f063d790c06745..31967fb8d965b8 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -369,6 +369,13 @@ public class RegexReductionTests [InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")] [InlineData(@"\b(\w+)\b", @"\b((?>\w+))\b")] [InlineData(@"\b(?:\w+)\b ", @"\b(?>\w+)\b ")] + [InlineData("(abc)*", "(?>(abc)*)")] + [InlineData("(abc)*?", "")] + [InlineData("(abc)+d", "(?>(abc)+)d")] + [InlineData("(abc)*d", "(?>(abc)*)d")] + [InlineData("(abc?)*?d", "(ab(?>c?))*?d")] + [InlineData("(ab*c)*d", "(?>(a(?>b*)c)*)d")] + [InlineData("(aba)?d", "(?>(aba)?)d")] // Nothing handling [InlineData(@"\wabc(?!)def", "(?!)")] [InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")] @@ -506,6 +513,11 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData(@"\b[a-z ]+\b", @"\b(?>[a-z ]+)\b")] [InlineData(@"\b[\p{L}\p{Mn}a]+\b", @"\b(?>[\p{L}\p{Mn}a]+)\b")] [InlineData(@"\b[\p{C}]+\b", @"\b(?>[\p{C}]+)\b")] + [InlineData("(aba)*d", "(?>(aba)*)d")] + [InlineData("(dba)*d", "(?>(dba)*)d")] + [InlineData("(abc?)*?d", "(?>(ab(?>c?))*)d")] + [InlineData("(aba)+d", "(?>(aba)+)d")] + [InlineData("(abc*)*d", "(?>(ab(?>c*))*)d")] // Loops inside alternation constructs [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")] [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]